332 files changed, 56032 insertions, 5313 deletions
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
index aa77021daf3..36bd3623e7f 100644
--- a/src/gallium/Makefile
+++ b/src/gallium/Makefile
@@ -3,6 +3,7 @@ include $(TOP)/configs/current
 
 
 SUBDIRS = auxiliary drivers
+# Note winsys/ needs to be built after src/mesa
 
 
 default: subdirs
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 7f0044c5a7f..4e7664f9bf0 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter
    --hash->data.d->size;
    return ret;
 }
+
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key)
+{
+   struct cso_node **node = cso_hash_find_node(hash, key);
+   return (*node != hash->data.e);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index 85f3e276c6a..5891c325fa5 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -44,6 +44,7 @@
 #ifndef CSO_HASH_H
 #define CSO_HASH_H
 
+#include "pipe/p_compiler.h"
 
 #ifdef	__cplusplus
 extern "C" {
@@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
  */
 struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
 
+/**
+ * Returns true if a value with the given key exists in the hash
+ */
+boolean   cso_hash_contains(struct cso_hash *hash, unsigned key);
+
 
 int       cso_hash_iter_is_null(struct cso_hash_iter iter);
 unsigned  cso_hash_iter_key(struct cso_hash_iter iter);
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index f2e36a89e90..bdbf5a08ede 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -40,6 +40,7 @@ C_SOURCES = \
 	draw_vs_aos_machine.c \
 	draw_vs_exec.c \
 	draw_vs_llvm.c \
+	draw_vs_ppc.c  \
 	draw_vs_sse.c 
 
 
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b6..5f05aa324a5 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
 		'draw_vs_aos_machine.c',
 		'draw_vs_exec.c',
 		'draw_vs_llvm.c',
+		'draw_vs_ppc.c',
 		'draw_vs_sse.c',
 		'draw_vs_varient.c'
 	])
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 37c4c87f87f..5d531146c5f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -196,7 +196,7 @@ struct draw_context
 
       const float (*aligned_constants)[4];
 
-      float (*aligned_constant_storage)[4];
+      const float (*aligned_constant_storage)[4];
       unsigned const_storage_size;
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 34adbd49b00..7f305304ff7 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw,
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
       if (!vs) {
-         vs = draw_create_vs_exec( draw, shader );
+         vs = draw_create_vs_ppc( draw, shader );
+         if (!vs) {
+            vs = draw_create_vs_exec( draw, shader );
+         }
       }
    }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 68c24abad3b..89ae158751a 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -158,6 +158,10 @@ draw_create_vs_sse(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
 struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+		   const struct pipe_shader_state *templ);
+
+struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 82d27d44934..80c36066577 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -62,12 +62,15 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
 
-   /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(evs->machine,
-				 shader->state.tokens,
-				 PIPE_MAX_SAMPLERS,
-				 NULL /*samplers*/ );
-
+   /* Specify the vertex program to interpret/execute.
+    * Avoid rebinding when possible.
+    */
+   if (evs->machine->Tokens != shader->state.tokens) {
+      tgsi_exec_machine_bind_shader(evs->machine,
+                                    shader->state.tokens,
+                                    PIPE_MAX_SAMPLERS,
+                                    NULL /*samplers*/ );
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
new file mode 100644
index 00000000000..8b751361449
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  *   Brian Paul
+  */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_config.h"
+
+#include "draw_vs.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_ppc.h"
+#include "tgsi/tgsi_ppc.h"
+#include "tgsi/tgsi_parse.h"
+
+
+
+typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
+                                             float (*outputs)[4][4],
+                                             float (*temps)[4][4],
+                                             float (*immeds)[4],
+                                             float (*consts)[4],
+                                             const float *builtins);
+
+
+struct draw_ppc_vertex_shader {
+   struct draw_vertex_shader base;
+   struct ppc_function ppc_program;
+
+   codegen_function func;
+};
+
+
+static void
+vs_ppc_prepare( struct draw_vertex_shader *base,
+		struct draw_context *draw )
+{
+   /* nothing */
+}
+
+
+/**
+ * Simplified vertex shader interface for the pt paths.  Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
+ */
+static void
+vs_ppc_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   unsigned int i;
+
+#define MAX_VERTICES 4
+
+   /* loop over verts */
+   for (i = 0; i < count; i += MAX_VERTICES) {
+      const uint max_vertices = MIN2(MAX_VERTICES, count - i);
+      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
+      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
+      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      uint attr;
+
+      /* convert (up to) four input verts to SoA format */
+      for (attr = 0; attr < base->info.num_inputs; attr++) {
+         const float *vIn = (const float *) input;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+#if 0
+            if (attr==0)
+               printf("Input v%d a%d: %f %f %f %f\n",
+                      vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]);
+#endif
+            inputs_soa[attr][0][vert] = vIn[attr * 4 + 0];
+            inputs_soa[attr][1][vert] = vIn[attr * 4 + 1];
+            inputs_soa[attr][2][vert] = vIn[attr * 4 + 2];
+            inputs_soa[attr][3][vert] = vIn[attr * 4 + 3];
+            vIn += input_stride / 4;
+         }
+      }
+
+      /* run compiled shader
+       */
+      shader->func(inputs_soa, outputs_soa, temps_soa,
+		   (float (*)[4]) shader->base.immediates,
+		   (float (*)[4]) constants,
+                   ppc_builtin_constants);
+
+      /* convert (up to) four output verts from SoA back to AoS format */
+      for (attr = 0; attr < base->info.num_outputs; attr++) {
+         float *vOut = (float *) output;
+         uint vert;
+         for (vert = 0; vert < max_vertices; vert++) {
+            vOut[attr * 4 + 0] = outputs_soa[attr][0][vert];
+            vOut[attr * 4 + 1] = outputs_soa[attr][1][vert];
+            vOut[attr * 4 + 2] = outputs_soa[attr][2][vert];
+            vOut[attr * 4 + 3] = outputs_soa[attr][3][vert];
+#if 0
+            if (attr==0)
+               printf("Output v%d a%d: %f %f %f %f\n",
+                      vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]);
+#endif
+            vOut += output_stride / 4;
+         }
+      }
+
+      /* advance to next group of four input/output verts */
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+   }
+}
+
+
+static void
+vs_ppc_delete( struct draw_vertex_shader *base )
+{
+   struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
+   
+   ppc_release_func( &shader->ppc_program );
+
+   align_free( (void *) shader->base.immediates );
+
+   FREE( (void*) shader->base.state.tokens );
+   FREE( shader );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc(struct draw_context *draw,
+                   const struct pipe_shader_state *templ)
+{
+   struct draw_ppc_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_ppc_vertex_shader );
+   if (vs == NULL) 
+      return NULL;
+
+   /* we make a private copy of the tokens */
+   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!vs->base.state.tokens)
+      goto fail;
+
+   tgsi_scan_shader(templ->tokens, &vs->base.info);
+
+   vs->base.draw = draw;
+#if 0
+   if (1)
+      vs->base.create_varient = draw_vs_varient_aos_ppc;
+   else
+#endif
+      vs->base.create_varient = draw_vs_varient_generic;
+   vs->base.prepare = vs_ppc_prepare;
+   vs->base.run_linear = vs_ppc_run_linear;
+   vs->base.delete = vs_ppc_delete;
+   
+   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
+                                      sizeof(float), 16);
+
+   ppc_init_func( &vs->ppc_program );
+
+   if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
+			&vs->ppc_program, 
+                       (float (*)[4]) vs->base.immediates, 
+                        TRUE )) 
+      goto fail;
+      
+   vs->func = (codegen_function) ppc_get_func( &vs->ppc_program );
+   if (!vs->func) {
+      goto fail;
+   }
+   
+   return &vs->base;
+
+fail:
+   /*
+   debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n");
+   */
+
+   ppc_release_func( &vs->ppc_program );
+   
+   FREE(vs);
+   return NULL;
+}
+
+
+
+#else /* PIPE_ARCH_PPC */
+
+
+struct draw_vertex_shader *
+draw_create_vs_ppc( struct draw_context *draw,
+		    const struct pipe_shader_state *templ )
+{
+   return (void *) 0;
+}
+
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a4a41e5445..93a9748bdb3 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -158,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
    llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
    llvm::ExecutionEngine *ee = cpu->engine;
    assert(ee);
-   /*FIXME : remove */
-   ee->DisableLazyCompilation();
+   /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+   ee->DisableLazyCompilation(false);
    ee->addModuleProvider(mp);
 
    llvm::Function *func = func_for_shader(prog);
@@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine()
 
 typedef void (*vertex_shader_runner)(void *ainputs,
                                      void *dests,
-                                     float (*aconsts)[4],
-                                     void *temps);
+                                     float (*aconsts)[4]);
 
 #define MAX_TGSI_VERTICES 4
 /*!
@@ -202,7 +201,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
    unsigned int i, j;
    unsigned slot;
    vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
    assert(runner);
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
@@ -224,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
       /* run shader */
       runner(machine->Inputs,
              machine->Outputs,
-             (float (*)[4]) constants,
-             machine->Temps);
+             (float (*)[4]) constants);
 
       /* Unswizzle all output results
        */
diff --git a/src/gallium/auxiliary/gallivm/gallivm_p.h b/src/gallium/auxiliary/gallivm/gallivm_p.h
index ebf3e11cd56..d2c5852bdf7 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_p.h
+++ b/src/gallium/auxiliary/gallivm/gallivm_p.h
@@ -101,10 +101,10 @@ static INLINE int gallivm_w_swizzle(int swizzle)
    return w;
 }
 
-#endif /* MESA_LLVM */
-
 #if defined __cplusplus
 }
 #endif
 
+#endif /* MESA_LLVM */
+
 #endif
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index a6580725512..d5600fd22da 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
    return res;
 }
 
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
-   std::vector<llvm::Value*> res(4);
-
-   //Extract x's
-   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
-                                                    m_storage->constantInt(0),
-                                                    name("extractX"));
-   //cast it to an unsigned int
-   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
-   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
-   //only x is valid. the others shouldn't be necessary
-   /*
-   res[1] = Constant::getNullValue(m_floatVecType);
-   res[2] = Constant::getNullValue(m_floatVecType);
-   res[3] = Constant::getNullValue(m_floatVecType);
-   */
-
-   return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
-   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
-   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
-   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
-   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
-   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
-   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
-   return res;
-}
-
 void InstructionsSoa::end()
 {
    m_builder.CreateRetVoid();
 }
 
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
-                                                const std::vector<llvm::Value*> in2,
-                                                const std::vector<llvm::Value*> in3)
-{
-   std::vector<llvm::Value*> res = mul(in1, in2);
-   return add(res, in3);
-}
-
 std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
 {
    std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
    return res;
 }
 
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+   return &m_builder;
+}
+
 void InstructionsSoa::createFunctionMap()
 {
    m_functionsMap[TGSI_OPCODE_ABS]   = "abs";
@@ -274,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   std::vector<llvm::Value*> res(4);
+
+   //Extract x's
+   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+                                                    m_storage->constantInt(0),
+                                                    name("extractX"));
+   //cast it to an unsigned int
+   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+   //only x is valid. the others shouldn't be necessary
+   /*
+   res[1] = Constant::getNullValue(m_floatVecType);
+   res[2] = Constant::getNullValue(m_floatVecType);
+   res[3] = Constant::getNullValue(m_floatVecType);
+   */
+
+   return res;
+}
+
 std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
 {
@@ -281,6 +264,59 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_LIT);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   std::vector<llvm::Value*> res = mul(in1, in2);
+   return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MAX);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MIN);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_POWER);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_RSQ);
+   return callBuiltin(func, in);
+}
 
 std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
@@ -289,6 +325,37 @@ std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+   return res;
+}
+
+void checkFunction(Function *func)
+{
+   for (Function::const_iterator BI = func->begin(), BE = func->end();
+        BI != BE; ++BI) {
+      const BasicBlock &BB = *BI;
+      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+           II != IE; ++II) {
+         const Instruction &I = *II;
+         std::cout<< "Instr = "<<I;
+         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+            const Value *Op = I.getOperand(op);
+            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+            //I->setOperand(op, V);
+  }
+      }
+   }
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
@@ -408,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
    return allocaToResult(allocaPtr);
 }
 
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_POWER);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MIN);
-   return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MAX);
-   return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
-   for (Function::const_iterator BI = func->begin(), BE = func->end();
-        BI != BE; ++BI) {
-      const BasicBlock &BB = *BI;
-      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
-           II != IE; ++II) {
-         const Instruction &I = *II;
-         std::cout<< "Instr = "<<I;
-         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
-            const Value *Op = I.getOperand(op);
-            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
-            //I->setOperand(op, V);
-  }
-      }
-   }
-}
-
 void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
 {
    assert(originalFunc);
@@ -492,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
    }
 }
 
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
-   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
-   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
-   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_LIT);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_RSQ);
-   return callBuiltin(func, in);
-}
 
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3817fdc904b..d6831e0a6b9 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -76,6 +76,7 @@ public:
    void         end();
 
    std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+   llvm::IRBuilder<>*  getIRBuilder();
 private:
    const char * name(const char *prefix) const;
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
diff --git a/src/gallium/auxiliary/gallivm/storage.cpp b/src/gallium/auxiliary/gallivm/storage.cpp
index 6f373f6dd5e..73df24c9769 100644
--- a/src/gallium/auxiliary/gallivm/storage.cpp
+++ b/src/gallium/auxiliary/gallivm/storage.cpp
@@ -323,7 +323,7 @@ llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
 
    if (indIdx) {
       getElem = GetElementPtrInst::Create(ptr,
-                                      BinaryOperator::create(Instruction::Add,
+                                      BinaryOperator::Create(Instruction::Add,
                                                              indIdx,
                                                              constantInt(idx),
                                                              name("add"),
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f0..e1e5cabcf55 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -48,13 +48,11 @@ using namespace llvm;
 StorageSoa::StorageSoa(llvm::BasicBlock *block,
                        llvm::Value *input,
                        llvm::Value *output,
-                       llvm::Value *consts,
-                       llvm::Value *temps)
+                       llvm::Value *consts)
    : m_block(block),
      m_input(input),
      m_output(output),
      m_consts(consts),
-     m_temps(temps),
      m_immediates(0),
      m_idx(0)
 {
@@ -93,7 +91,7 @@ void StorageSoa::declareImmediates()
       std::vector<float> vals(4);
       std::vector<Constant*> channelArray;
 
-      vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
       llvm::Constant *xChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +142,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
 {
-   std::vector<llvm::Value*> res(4);
-   llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+   std::vector<llvm::Value*> x(4);
+   x[0] = m_builder->CreateExtractElement(vector,
+                                           constantInt(cc),
+                                           name("x"));
+
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder->CreateInsertElement(constVector, x[0],
+                                              constantInt(0),
+                                              name("vecx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+                               name("vecxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+                               name("vecxxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+                               name("vecxxxx"));
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+   llvm::Value* res;
+   std::vector<llvm::Value*> res2(4);
+   llvm::Value *xChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
-   yChannel = elementPointer(m_consts, idx, 1);
-   zChannel = elementPointer(m_consts, idx, 2);
-   wChannel = elementPointer(m_consts, idx, 3);
 
-   res[0] = alignedArrayLoad(xChannel);
-   res[1] = alignedArrayLoad(yChannel);
-   res[2] = alignedArrayLoad(zChannel);
-   res[3] = alignedArrayLoad(wChannel);
+   res = alignedArrayLoad(xChannel);
 
-   return res;
+   res2[0]=unpackConstElement(m_builder, res,0);
+   res2[1]=unpackConstElement(m_builder, res,1);
+   res2[2]=unpackConstElement(m_builder, res,2);
+   res2[3]=unpackConstElement(m_builder, res,3);
+
+   return res2;
 }
 
 std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -174,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
 {
    std::vector<llvm::Value*> res(4);
+   llvm::Value *temp = m_temps[idx];
 
-   res[0] = element(m_temps, idx, 0);
-   res[1] = element(m_temps, idx, 1);
-   res[2] = element(m_temps, idx, 2);
-   res[3] = element(m_temps, idx, 3);
+   res[0] = element(temp, constantInt(0), 0);
+   res[1] = element(temp, constantInt(0), 1);
+   res[2] = element(temp, constantInt(0), 2);
+   res[3] = element(temp, constantInt(0), 3);
 
    return res;
 }
@@ -260,6 +280,12 @@ llvm::Module * StorageSoa::currentModule() const
     return m_block->getParent()->getParent();
 }
 
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+   Constant*c = ConstantFP::get(APFloat(val));
+   return c;
+}
+
 llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
 {
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +304,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
 }
 
 std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
-                                           llvm::Value *indIdx)
+                                           llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
 {
    std::vector<llvm::Value*> val(4);
 
@@ -299,10 +325,10 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       val = outputElement(realIndex);
       break;
    case TGSI_FILE_TEMPORARY:
-      val = tempElement(realIndex);
+      val = tempElement(m_builder, idx);
       break;
    case TGSI_FILE_CONSTANT:
-      val = constElement(realIndex);
+      val = constElement(m_builder, realIndex);
       break;
    case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
@@ -328,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
    return res;
 }
 
+llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
+{
+   VectorType *vector   = VectorType::get(Type::FloatTy, 4);
+   ArrayType  *vecArray = ArrayType::get(vector, 4);
+   AllocaInst *alloca = new AllocaInst(vecArray, "temp",
+                                       m_builder->GetInsertBlock());
+
+   return alloca;
+}
+
+
 void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-                       int mask)
+                       int mask, llvm::IRBuilder<>* m_builder)
 {
    llvm::Value *out = 0;
+   llvm::Value *realIndex = 0;
    switch(type) {
    case TGSI_FILE_OUTPUT:
       out = m_output;
+      realIndex = constantInt(idx);
       break;
    case TGSI_FILE_TEMPORARY:
-      out = m_temps;
+      // if that temp doesn't already exist, alloca it
+      if (m_temps.find(idx) == m_temps.end())
+         m_temps[idx] = allocaTemp(m_builder);
+
+      out = m_temps[idx];
+
+      realIndex = constantInt(0);
       break;
    case TGSI_FILE_INPUT:
       out = m_input;
+      realIndex = constantInt(idx);
       break;
    case TGSI_FILE_ADDRESS: {
       llvm::Value *addr = m_addresses[idx];
@@ -358,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm
       assert(0);
       break;
    }
-   llvm::Value *realIndex = constantInt(idx);
    if ((mask & TGSI_WRITEMASK_X)) {
       llvm::Value *xChannel = elementPointer(out, realIndex, 0);
       new StoreInst(val[0], xChannel, false, m_block);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6aee..56886f85e7a 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
 #define STORAGESOA_H
 
 #include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <vector>
 #include <list>
@@ -51,14 +52,13 @@ public:
    StorageSoa(llvm::BasicBlock *block,
               llvm::Value *input,
               llvm::Value *output,
-              llvm::Value *consts,
-              llvm::Value *temps);
+              llvm::Value *consts);
 
 
    std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
-                                  llvm::Value *indIdx =0);
+                                  llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
    void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-              int mask);
+              int mask, llvm::IRBuilder<>* m_builder);
 
    void addImmediate(float *vec);
    void declareImmediates();
@@ -76,12 +76,14 @@ private:
    const char *name(const char *prefix) const;
    llvm::Value  *alignedArrayLoad(llvm::Value *val);
    llvm::Module *currentModule() const;
+   llvm::Constant  *createConstGlobalFloat(const float val);
    llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
 
    std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+   llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+   std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
    std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
    std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
 private:
    llvm::BasicBlock *m_block;
@@ -89,12 +91,13 @@ private:
    llvm::Value *m_input;
    llvm::Value *m_output;
    llvm::Value *m_consts;
-   llvm::Value *m_temps;
+   std::map<int, llvm::Value*> m_temps;
    llvm::GlobalVariable *m_immediates;
 
    std::map<int, llvm::Value*> m_addresses;
 
    std::vector<std::vector<float> > m_immediatesToFlush;
+   llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
 
    mutable std::map<int, llvm::ConstantInt*> m_constInts;
    mutable char        m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 7292c0e366c..c11b88af9ec 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,8 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
    // pass are castable to the following:
    // [4 x <4 x float>] inputs,
    // [4 x <4 x float>] output,
-   // [4 x [4 x float]] consts,
-   // [4 x <4 x float>] temps
+   // [4 x [1 x float]] consts,
 
    std::vector<const Type*> funcArgs;
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -61,13 +60,12 @@ static inline FunctionType *vertexShaderFunctionType()
    PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
 
    ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
-   ArrayType   *constsArray    = ArrayType::get(floatArray, 4);
+   ArrayType   *constsArray    = ArrayType::get(floatArray, 1);
    PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
 
    funcArgs.push_back(vectorArrayPtr);//inputs
    funcArgs.push_back(vectorArrayPtr);//output
    funcArgs.push_back(constsArrayPtr);//consts
-   funcArgs.push_back(vectorArrayPtr);//temps
 
    FunctionType *functionType = FunctionType::get(
       /*Result=*/Type::VoidTy,
@@ -707,9 +705,8 @@ translate_instructionir(llvm::Module *module,
       if (src->SrcRegister.Indirect) {
          indIdx = storage->addrElement(src->SrcRegisterInd.Index);
       }
-
       val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
-                          src->SrcRegister.Index, swizzle, indIdx);
+                          src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
 
       inputs[i] = val;
    }
@@ -1025,9 +1022,9 @@ translate_instructionir(llvm::Module *module,
    /* store results  */
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
       storage->store((enum tgsi_file_type)dst->DstRegister.File,
-                     dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+                     dst->DstRegister.Index, out, dst->DstRegister.WriteMask,
+		     instr->getIRBuilder() );
    }
 }
 
@@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
    output->setName("outputs");
    Value *consts = args++;
    consts->setName("consts");
-   Value *temps = args++;
-   temps->setName("temps");
 
    BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
 
@@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
    fi = tgsi_default_full_instruction();
    fd = tgsi_default_full_declaration();
 
-   StorageSoa storage(label_entry, input, output, consts, temps);
+   StorageSoa storage(label_entry, input, output, consts);
    InstructionsSoa instr(mod, shader, label_entry, &storage);
 
    while(!tgsi_parse_end_of_tokens(&parse)) {
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index c9570d7be2a..dd22ef34ecc 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -217,6 +217,7 @@ pb_destroy(struct pb_buffer *buf)
    assert(buf);
    if(!buf)
       return;
+   assert(buf->base.refcount == 0);
    buf->vtbl->destroy(buf);
 }
 
@@ -227,11 +228,16 @@ static INLINE void
 pb_reference(struct pb_buffer **dst,
              struct pb_buffer *src)
 {
-   if (src) 
+   if (src) {
+      assert(src->base.refcount);
       src->base.refcount++;
+   }
 
-   if (*dst && --(*dst)->base.refcount == 0)
-      pb_destroy( *dst );
+   if (*dst) {
+      assert((*dst)->base.refcount);
+      if(--(*dst)->base.refcount == 0)
+         pb_destroy( *dst );
+   }
 
    *dst = src;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index f8fd2cd531e..513ed28ca63 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
    
    fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
    if(!fenced_buf) {
-      assert(buf->base.refcount == 1);
-      pb_destroy(buf);
+      pb_reference(&buf, NULL);
    }
    
    return fenced_buf;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 607f10bc732..fd39838bfdf 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
    assert(buf->base.refcount == 0);
    
    pipe_mutex_lock(mm->mutex);
-   mmFreeMem(mm_buf->block);
+   u_mmFreeMem(mm_buf->block);
    FREE(buf);
    pipe_mutex_unlock(mm->mutex);
 }
@@ -198,14 +198,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->mgr = mm;
    
-   mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+   mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
       debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
       
-      mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+      mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
       if(!mm_buf->block) {
          FREE(mm_buf);
          pipe_mutex_unlock(mm->mutex);
@@ -236,7 +236,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
    
    pipe_mutex_lock(mm->mutex);
 
-   mmDestroy(mm->heap);
+   u_mmDestroy(mm->heap);
    
    pb_unmap(mm->buffer);
    pb_reference(&mm->buffer, NULL);
@@ -277,7 +277,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    if(!mm->map)
       goto failure;
 
-   mm->heap = mmInit(0, size); 
+   mm->heap = u_mmInit(0, size); 
    if (!mm->heap)
       goto failure;
 
@@ -285,7 +285,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    
 failure:
 if(mm->heap)
-   mmDestroy(mm->heap);
+   u_mmDestroy(mm->heap);
    if(mm->map)
       pb_unmap(mm->buffer);
    if(mm)
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7a..252dc5274ab 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c \
+	rtasm_ppc.c \
 	rtasm_ppc_spe.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa1..eb48368accb 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c',
+		'rtasm_ppc.c',
 		'rtasm_ppc_spe.c',
 	])
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index f16191cb619..be7433baf87 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
 #include "rtasm_execmem.h"
 
 
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
 
 /*
  * Allocate a large block of memory which can hold code then dole it out
  * in pieces by means of the generic memory manager code.
-*/
+ */
 
 #include <unistd.h>
 #include <sys/mman.h>
@@ -62,7 +63,7 @@ static void
 init_heap(void)
 {
    if (!exec_heap)
-      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+      exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
    
    if (!exec_mem)
       exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, 
@@ -83,7 +84,7 @@ rtasm_exec_malloc(size_t size)
 
    if (exec_heap) {
       size = (size + 31) & ~31;  /* next multiple of 32 bytes */
-      block = mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
+      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
    }
 
    if (block)
@@ -103,17 +104,17 @@ rtasm_exec_free(void *addr)
    pipe_mutex_lock(exec_mutex);
 
    if (exec_heap) {
-      struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+      struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
    
       if (block)
-	 mmFreeMem(block);
+	 u_mmFreeMem(block);
    }
 
    pipe_mutex_unlock(exec_mutex);
 }
 
 
-#else
+#else /* PIPE_OS_LINUX */
 
 /*
  * Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif
+#endif /* PIPE_OS_LINUX */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 00000000000..b65bfa7bbdf
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,959 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
+ * \author Brian Paul
+ */
+
+
+#include <stdio.h>
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p)
+{
+   uint i;
+
+   p->num_inst = 0;
+   p->max_inst = 100; /* first guess at buffer size */
+   p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+   p->reg_used = 0x0;
+   p->fp_used = 0x0;
+   p->vec_used = 0x0;
+
+   /* only allow using gp registers 3..12 for now */
+   for (i = 0; i < 3; i++)
+      ppc_reserve_register(p, i);
+   for (i = 12; i < PPC_NUM_REGS; i++)
+      ppc_reserve_register(p, i);
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+   assert(p->num_inst <= p->max_inst);
+   if (p->store != NULL) {
+      rtasm_exec_free(p->store);
+   }
+   p->store = NULL;
+}
+
+
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+   return p->num_inst;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+   DUMP_END();
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+#endif
+      return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+   uint i;
+   for (i = 0; i < p->num_inst; i++) {
+      debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+   }
+}
+
+
+/**
+ * Mark a register as being unavailable.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_REGS);
+   p->reg_used |= (1 << reg);
+   return reg;
+}
+
+
+/**
+ * Allocate a general purpose register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->reg_used & mask) == 0) {
+         p->reg_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given general purpose register as "unallocated".
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_REGS);
+   assert(p->reg_used & (1 << reg));
+   p->reg_used &= ~(1 << reg);
+}
+
+
+/**
+ * Allocate a floating point register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->fp_used & mask) == 0) {
+         p->fp_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated".
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_FP_REGS);
+   assert(p->fp_used & (1 << reg));
+   p->fp_used &= ~(1 << reg);
+}
+
+
+/**
+ * Allocate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->vec_used & mask) == 0) {
+         p->vec_used |= mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated".
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_VEC_REGS);
+   assert(p->vec_used & (1 << reg));
+   p->vec_used &= ~(1 << reg);
+}
+
+
+/**
+ * Append instruction to instruction buffer.  Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+      }
+      rtasm_exec_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
+
+union vx_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned op2:11;
+   } inst;
+};
+
+static INLINE void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   emit_instruction(p, inst.bits);
+};
+
+
+union vxr_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned rC:1;
+      unsigned op2:10;
+   } inst;
+};
+
+static INLINE void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   emit_instruction(p, inst.bits);
+};
+
+
+union va_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned vC:5;
+      unsigned op2:6;
+   } inst;
+};
+
+static INLINE void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+   union va_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.vC = vC;
+   inst.inst.op2 = op2;
+   emit_instruction(p, inst.bits);
+};
+
+
+union i_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned li:24;
+      unsigned aa:1;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
+{
+   union i_inst inst;
+   inst.inst.op = op;
+   inst.inst.li = li;
+   inst.inst.aa = aa;
+   inst.inst.lk = lk;
+   emit_instruction(p, inst.bits);
+}
+
+
+union xl_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned bo:5;
+      unsigned bi:5;
+      unsigned unused:3;
+      unsigned bh:2;
+      unsigned op2:10;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+        uint op2, uint lk)
+{
+   union xl_inst inst;
+   inst.inst.op = op;
+   inst.inst.bo = bo;
+   inst.inst.bi = bi;
+   inst.inst.unused = 0x0;
+   inst.inst.bh = bh;
+   inst.inst.op2 = op2;
+   inst.inst.lk = lk;
+   emit_instruction(p, inst.bits);
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+   union xl_inst i;
+
+   i.bits = inst;
+   debug_printf("%s = 0x%08x\n", name, inst);
+   debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+   debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+   debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+   debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+   debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+   debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+   debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vrs:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned op2:10;
+      unsigned unused:1;
+   } inst;
+};
+
+static INLINE void
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
+{
+   union x_inst inst;
+   inst.inst.op = op;
+   inst.inst.vrs = vrs;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.op2 = op2;
+   inst.inst.unused = 0x0;
+   emit_instruction(p, inst.bits);
+}
+
+
+union d_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned si:16;
+   } inst;
+};
+
+static INLINE void
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+   union d_inst inst;
+   assert(si >= -32768);
+   assert(si <= 32767);
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.si = (unsigned) (si & 0xffff);
+   emit_instruction(p, inst.bits);
+};
+
+
+union a_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned frt:5;
+      unsigned fra:5;
+      unsigned frb:5;
+      unsigned unused:5;
+      unsigned op2:5;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+       uint rc)
+{
+   union a_inst inst;
+   inst.inst.op = op;
+   inst.inst.frt = frt;
+   inst.inst.fra = fra;
+   inst.inst.frb = frb;
+   inst.inst.unused = 0x0;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   emit_instruction(p, inst.bits);
+};
+
+
+union xo_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned oe:1;
+      unsigned op2:9;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+        uint op2, uint rc)
+{
+   union xo_inst inst;
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.oe = oe;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   emit_instruction(p, inst.bits);
+}
+
+
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float substract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add: vD = vA * vB + vC */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
+}
+
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocol */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocol sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 586, vD, 0, vB);
+}
+
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 103);
+}
+
+/** load vector element word: vR = mem_word[ra+rb] */
+void
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
+{
+   emit_x(p, 31, vr, ra, rb, 71);
+}
+
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1220, vD, vA, vB);
+}
+
+/** Pseudo-instruction: vector move */
+void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
+{
+   ppc_vor(p, vD, vA, vA);
+}
+
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+   ppc_vxor(p, vr, vr, vr);
+}
+
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 42, vD, imm, vB);
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 652, vD, imm, vB);
+}
+
+/** vector splat signed immediate word */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm < 15);
+   emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** rt = ra + imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 14, rt, ra, imm);
+}
+
+/** rt = ra + (imm << 16) */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 15, rt, ra, imm);
+}
+
+/** rt = ra + rb */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND ra */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 28);  /* note argument order */
+}
+
+/** rt = ra AND imm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 28, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra OR ra */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 444);  /* note argument order */
+}
+
+/** rt = ra OR imm */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 24, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra XOR ra */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 316);  /* note argument order */
+}
+
+/** rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 26, ra, rt, imm);  /* note argument order */
+}
+
+/** pseudo instruction: move: rt = ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+   ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate: rt = imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addi(p, rt, 0, imm);
+}
+
+/** rt = imm << 16 */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addis(p, rt, 0, imm);
+}
+
+/** rt = imm */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = imm >> 16 */
+   ppc_ori(p, rt, rt, (imm & 0xffff));   /* rt = rt | (imm & 0xffff) */
+}
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 37, rs, ra, d);
+}
+
+/** store rs at memory[(ra)+d] */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 36, rs, ra, d);
+}
+
+/** Load rt = mem[(ra)+d];  then zero set high 32 bits to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+   emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** add: frt = fra + frb */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** sub: frt = fra - frb */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert to int: rt = (int) ra */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+   emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** store frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+   emit_d(p, 52, frs, ra, offset);
+}
+
+/** store frs at mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+   emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** load frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+   emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: Branch to link register (p. 35) */
+void
+ppc_blr(struct ppc_function *p)
+{
+   emit_i(p, 18, 0, 0, 1);
+}
+
+/** Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+   emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+}
+
+/** Pseudo instruction: return from subroutine */
+void
+ppc_return(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 00000000000..08212a2a253
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,335 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
+
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
+#define PPC_NUM_VEC_REGS 32
+
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS       0x14  /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN   0x0   /* binary 00 */
+
+
+struct ppc_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
+   uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+   uint32_t fp_used;   /** used/free floating point registers bitmask */
+   uint32_t vec_used;   /** used/free vector registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p);
+extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float substract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add: vD = vA * vB + vC */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** load vector element word: vR = mem_word[vA+vB] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+
+
+/**
+ ** vector bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
+
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+extern void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff7..b9a75ae5591 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <[email protected]>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,21 +147,91 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+static void
+indent(const struct spe_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+   return longname + 4;
+}
+
+
+static const char *
+reg_name(int reg)
+{
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      {
+         /* cycle through four buffers to handle multiple calls per printf */
+         static char buf[4][10];
+         static int b = 0;
+         b = (b + 1) % 4;
+         sprintf(buf[b], "$%d", reg);
+         return buf[b];
+      }
+   }
+}
+
+
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+      }
+      align_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, %s\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -165,155 +239,212 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+              reg_name(rA), reg_name(rB), reg_name(rC));
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
 }
 
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
 }
 
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+    }
+}
+
+
+/** As above, but do range checking on signed immediate value */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    assert(imm <= 511);
+    assert(imm >= -512);
+    emit_RI10(p, op, rT, rA, imm, name);
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
+    if (p->print) {
+       indent(p);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
+    }
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
- * \param code_size  size of instruction buffer to allocate, in bytes.
+ * \param code_size  initial size of instruction buffer to allocate, in bytes.
+ *                   If zero, use a default.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
-    p->store = align_malloc(code_size, 16);
+    unsigned int i;
+
+    if (!code_size)
+       code_size = 64;
+
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
+    p->store = align_malloc(code_size, 16);
+
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
 
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -327,20 +458,23 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return p->num_inst * SPE_INST_SIZE;
+}
+
+
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -354,31 +488,160 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
+
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0)
+         p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
+
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0)
+         p->regs[i]--;
+   }
+}
+
+
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned i, num = 0;
+   /* only count registers in the range available to callers */
+   for (i = 2; i < 80; i++) {
+      if (p->regs[i]) {
+         used[num++] = i;
+      }
+   }
+   return num;
+}
+
+
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
 
-   p->regs[idx] |= (1ULL << bit);
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("# %s\n", s);
+   }
+}
+
+
+/**
+ * Load quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
+   emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
+   emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
 }
 
 
@@ -392,51 +655,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
@@ -454,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
@@ -505,30 +767,226 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
    else {
       spe_ilhu(p, rT, i >> 16);
-      spe_iohl(p, rT, i & 0xffff);
+      if (i & 0xffff)
+         spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* If the whole value is in the lower 18 bits, use ila, which
+    * doesn't sign-extend.  Otherwise, if the two halfwords of
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
+    */
+   if ((ui & 0x0003ffff) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
+   else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
+   }
+}
+
+/**
+ * This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+
+/**
+ * This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
    }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 10 bits or less, it fits inside a
+    * Compare Logical Greater Than Word Immediate instruction.
+    */
+   if ((ui & 0x000003ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
-   spe_shufb(p, rT, rA, rA, rT);
+   /* Use a temporary, just in case rT == rA */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, tmp_reg, 0x00010203);
+   spe_shufb(p, rT, rA, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
 void
 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ori(p, rT, rA, 0);
+   /* Use different instructions depending on the instruction address
+    * to take advantage of the dual pipelines.
+    */
+   if (p->num_inst & 1)
+      spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
+   else
+      spe_ori(p, rT, rA, 0);  /* even pipe */
 }
 
 
@@ -539,4 +997,70 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   assert(word >= 0);
+   assert(word <= 3);
+
+   if (word == 0) {
+      int tmp1 = rT;
+      spe_ila(p, tmp1, 66051);
+      spe_shufb(p, rT, rA, rA, tmp1);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      int tmp1 = rT;
+      int tmp2 = spe_allocate_available_register(p);
+
+      spe_ila(p, tmp1, 66051);
+      spe_rotqbyi(p, tmp2, rA, 4 * word);
+      spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+      spe_release_register(p, tmp2);
+   }
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rB > rA, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rt == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace34..f9ad2acacdd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <[email protected]>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -39,10 +40,10 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
 #define SPE_REG_RA  0
 
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
 #define SPE_REG_SP  1
 
 
@@ -52,221 +53,258 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * are allowed.  In the unlikely case that we exceed the set count,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
-#define EMIT_(name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT)
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
+#define EMIT_(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
 #define EMIT_RR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB)
+                       unsigned rB);
 #define EMIT_RRR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB, unsigned rC)
+                       unsigned rB, unsigned rC);
 #define EMIT_RI7(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI8(_name, _op, bias) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+                       int imm);
 #define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_I16(_name, _op) \
-    extern void _name (struct spe_function *p, int imm)
+    extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
-EMIT_RR  (spe_lqx,  0x1c4);
-EMIT_RI16(spe_lqa,  0x061);
-EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
-EMIT_RR  (spe_stqx, 0x144);
-EMIT_RI16(spe_stqa, 0x041);
-EMIT_RI16(spe_stqr, 0x047);
-EMIT_RI7 (spe_cbd,  0x1f4);
-EMIT_RR  (spe_cbx,  0x1d4);
-EMIT_RI7 (spe_chd,  0x1f5);
-EMIT_RI7 (spe_chx,  0x1d5);
-EMIT_RI7 (spe_cwd,  0x1f6);
-EMIT_RI7 (spe_cwx,  0x1d6);
-EMIT_RI7 (spe_cdd,  0x1f7);
-EMIT_RI7 (spe_cdx,  0x1d7);
+EMIT_RR  (spe_lqx,  0x1c4)
+EMIT_RI16(spe_lqa,  0x061)
+EMIT_RI16(spe_lqr,  0x067)
+EMIT_RR  (spe_stqx, 0x144)
+EMIT_RI16(spe_stqa, 0x041)
+EMIT_RI16(spe_stqr, 0x047)
+EMIT_RI7 (spe_cbd,  0x1f4)
+EMIT_RR  (spe_cbx,  0x1d4)
+EMIT_RI7 (spe_chd,  0x1f5)
+EMIT_RI7 (spe_chx,  0x1d5)
+EMIT_RI7 (spe_cwd,  0x1f6)
+EMIT_RI7 (spe_cwx,  0x1d6)
+EMIT_RI7 (spe_cdd,  0x1f7)
+EMIT_RI7 (spe_cdx,  0x1d7)
 
 
 /* Constant formation instructions
  */
-EMIT_RI16(spe_ilh,   0x083);
-EMIT_RI16(spe_ilhu,  0x082);
-EMIT_RI16(spe_il,    0x081);
-EMIT_RI18(spe_ila,   0x021);
-EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x065);
+EMIT_RI16(spe_ilh,   0x083)
+EMIT_RI16(spe_ilhu,  0x082)
+EMIT_RI16(spe_il,    0x081)
+EMIT_RI18(spe_ila,   0x021)
+EMIT_RI16(spe_iohl,  0x0c1)
+EMIT_RI16(spe_fsmbi, 0x065)
 
 
 
 /* Integer and logical instructions
  */
-EMIT_RR  (spe_ah,      0x0c8);
-EMIT_RI10(spe_ahi,     0x01d);
-EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
-EMIT_RR  (spe_sfh,     0x048);
-EMIT_RI10(spe_sfhi,    0x00d);
-EMIT_RR  (spe_sf,      0x040);
-EMIT_RI10(spe_sfi,     0x00c);
-EMIT_RR  (spe_addx,    0x340);
-EMIT_RR  (spe_cg,      0x0c2);
-EMIT_RR  (spe_cgx,     0x342);
-EMIT_RR  (spe_sfx,     0x341);
-EMIT_RR  (spe_bg,      0x042);
-EMIT_RR  (spe_bgx,     0x343);
-EMIT_RR  (spe_mpy,     0x3c4);
-EMIT_RR  (spe_mpyu,    0x3cc);
-EMIT_RI10(spe_mpyi,    0x074);
-EMIT_RI10(spe_mpyui,   0x075);
-EMIT_RRR (spe_mpya,    0x00c);
-EMIT_RR  (spe_mpyh,    0x3c5);
-EMIT_RR  (spe_mpys,    0x3c7);
-EMIT_RR  (spe_mpyhh,   0x3c6);
-EMIT_RR  (spe_mpyhha,  0x346);
-EMIT_RR  (spe_mpyhhu,  0x3ce);
-EMIT_RR  (spe_mpyhhau, 0x34e);
-EMIT_R   (spe_clz,     0x2a5);
-EMIT_R   (spe_cntb,    0x2b4);
-EMIT_R   (spe_fsmb,    0x1b6);
-EMIT_R   (spe_fsmh,    0x1b5);
-EMIT_R   (spe_fsm,     0x1b4);
-EMIT_R   (spe_gbb,     0x1b2);
-EMIT_R   (spe_gbh,     0x1b1);
-EMIT_R   (spe_gb,      0x1b0);
-EMIT_RR  (spe_avgb,    0x0d3);
-EMIT_RR  (spe_absdb,   0x053);
-EMIT_RR  (spe_sumb,    0x253);
-EMIT_R   (spe_xsbh,    0x2b6);
-EMIT_R   (spe_xshw,    0x2ae);
-EMIT_R   (spe_xswd,    0x2a6);
-EMIT_RR  (spe_and,     0x0c1);
-EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
-EMIT_RR  (spe_or,      0x041);
-EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
-EMIT_R   (spe_orx,     0x1f0);
-EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
-EMIT_RR  (spe_nand,    0x0c9);
-EMIT_RR  (spe_nor,     0x049);
-EMIT_RR  (spe_eqv,     0x249);
-EMIT_RRR (spe_selb,    0x008);
-EMIT_RRR (spe_shufb,   0x00b);
+EMIT_RR  (spe_ah,      0x0c8)
+EMIT_RI10(spe_ahi,     0x01d)
+EMIT_RR  (spe_a,       0x0c0)
+EMIT_RI10s(spe_ai,      0x01c)
+EMIT_RR  (spe_sfh,     0x048)
+EMIT_RI10(spe_sfhi,    0x00d)
+EMIT_RR  (spe_sf,      0x040)
+EMIT_RI10(spe_sfi,     0x00c)
+EMIT_RR  (spe_addx,    0x340)
+EMIT_RR  (spe_cg,      0x0c2)
+EMIT_RR  (spe_cgx,     0x342)
+EMIT_RR  (spe_sfx,     0x341)
+EMIT_RR  (spe_bg,      0x042)
+EMIT_RR  (spe_bgx,     0x343)
+EMIT_RR  (spe_mpy,     0x3c4)
+EMIT_RR  (spe_mpyu,    0x3cc)
+EMIT_RI10(spe_mpyi,    0x074)
+EMIT_RI10(spe_mpyui,   0x075)
+EMIT_RRR (spe_mpya,    0x00c)
+EMIT_RR  (spe_mpyh,    0x3c5)
+EMIT_RR  (spe_mpys,    0x3c7)
+EMIT_RR  (spe_mpyhh,   0x3c6)
+EMIT_RR  (spe_mpyhha,  0x346)
+EMIT_RR  (spe_mpyhhu,  0x3ce)
+EMIT_RR  (spe_mpyhhau, 0x34e)
+EMIT_R   (spe_clz,     0x2a5)
+EMIT_R   (spe_cntb,    0x2b4)
+EMIT_R   (spe_fsmb,    0x1b6)
+EMIT_R   (spe_fsmh,    0x1b5)
+EMIT_R   (spe_fsm,     0x1b4)
+EMIT_R   (spe_gbb,     0x1b2)
+EMIT_R   (spe_gbh,     0x1b1)
+EMIT_R   (spe_gb,      0x1b0)
+EMIT_RR  (spe_avgb,    0x0d3)
+EMIT_RR  (spe_absdb,   0x053)
+EMIT_RR  (spe_sumb,    0x253)
+EMIT_R   (spe_xsbh,    0x2b6)
+EMIT_R   (spe_xshw,    0x2ae)
+EMIT_R   (spe_xswd,    0x2a6)
+EMIT_RR  (spe_and,     0x0c1)
+EMIT_RR  (spe_andc,    0x2c1)
+EMIT_RI10s(spe_andbi,   0x016)
+EMIT_RI10s(spe_andhi,   0x015)
+EMIT_RI10s(spe_andi,    0x014)
+EMIT_RR  (spe_or,      0x041)
+EMIT_RR  (spe_orc,     0x2c9)
+EMIT_RI10s(spe_orbi,    0x006)
+EMIT_RI10s(spe_orhi,    0x005)
+EMIT_RI10s(spe_ori,     0x004)
+EMIT_R   (spe_orx,     0x1f0)
+EMIT_RR  (spe_xor,     0x241)
+EMIT_RI10s(spe_xorbi,   0x046)
+EMIT_RI10s(spe_xorhi,   0x045)
+EMIT_RI10s(spe_xori,    0x044)
+EMIT_RR  (spe_nand,    0x0c9)
+EMIT_RR  (spe_nor,     0x049)
+EMIT_RR  (spe_eqv,     0x249)
+EMIT_RRR (spe_selb,    0x008)
+EMIT_RRR (spe_shufb,   0x00b)
 
 
 /* Shift and rotate instructions
  */
-EMIT_RR  (spe_shlh,      0x05f);
-EMIT_RI7 (spe_shlhi,     0x07f);
-EMIT_RR  (spe_shl,       0x05b);
-EMIT_RI7 (spe_shli,      0x07b);
-EMIT_RR  (spe_shlqbi,    0x1db);
-EMIT_RI7 (spe_shlqbii,   0x1fb);
-EMIT_RR  (spe_shlqby,    0x1df);
-EMIT_RI7 (spe_shlqbyi,   0x1ff);
-EMIT_RR  (spe_shlqbybi,  0x1cf);
-EMIT_RR  (spe_roth,      0x05c);
-EMIT_RI7 (spe_rothi,     0x07c);
-EMIT_RR  (spe_rot,       0x058);
-EMIT_RI7 (spe_roti,      0x078);
-EMIT_RR  (spe_rotqby,    0x1dc);
-EMIT_RI7 (spe_rotqbyi,   0x1fc);
-EMIT_RR  (spe_rotqbybi,  0x1cc);
-EMIT_RR  (spe_rotqbi,    0x1d8);
-EMIT_RI7 (spe_rotqbii,   0x1f8);
-EMIT_RR  (spe_rothm,     0x05d);
-EMIT_RI7 (spe_rothmi,    0x07d);
-EMIT_RR  (spe_rotm,      0x059);
-EMIT_RI7 (spe_rotmi,     0x079);
-EMIT_RR  (spe_rotqmby,   0x1dd);
-EMIT_RI7 (spe_rotqmbyi,  0x1fd);
-EMIT_RR  (spe_rotqmbybi, 0x1cd);
-EMIT_RR  (spe_rotqmbi,   0x1c9);
-EMIT_RI7 (spe_rotqmbii,  0x1f9);
-EMIT_RR  (spe_rotmah,    0x05e);
-EMIT_RI7 (spe_rotmahi,   0x07e);
-EMIT_RR  (spe_rotma,     0x05a);
-EMIT_RI7 (spe_rotmai,    0x07a);
+EMIT_RR  (spe_shlh,      0x05f)
+EMIT_RI7 (spe_shlhi,     0x07f)
+EMIT_RR  (spe_shl,       0x05b)
+EMIT_RI7 (spe_shli,      0x07b)
+EMIT_RR  (spe_shlqbi,    0x1db)
+EMIT_RI7 (spe_shlqbii,   0x1fb)
+EMIT_RR  (spe_shlqby,    0x1df)
+EMIT_RI7 (spe_shlqbyi,   0x1ff)
+EMIT_RR  (spe_shlqbybi,  0x1cf)
+EMIT_RR  (spe_roth,      0x05c)
+EMIT_RI7 (spe_rothi,     0x07c)
+EMIT_RR  (spe_rot,       0x058)
+EMIT_RI7 (spe_roti,      0x078)
+EMIT_RR  (spe_rotqby,    0x1dc)
+EMIT_RI7 (spe_rotqbyi,   0x1fc)
+EMIT_RR  (spe_rotqbybi,  0x1cc)
+EMIT_RR  (spe_rotqbi,    0x1d8)
+EMIT_RI7 (spe_rotqbii,   0x1f8)
+EMIT_RR  (spe_rothm,     0x05d)
+EMIT_RI7 (spe_rothmi,    0x07d)
+EMIT_RR  (spe_rotm,      0x059)
+EMIT_RI7 (spe_rotmi,     0x079)
+EMIT_RR  (spe_rotqmby,   0x1dd)
+EMIT_RI7 (spe_rotqmbyi,  0x1fd)
+EMIT_RR  (spe_rotqmbybi, 0x1cd)
+EMIT_RR  (spe_rotqmbi,   0x1c9)
+EMIT_RI7 (spe_rotqmbii,  0x1f9)
+EMIT_RR  (spe_rotmah,    0x05e)
+EMIT_RI7 (spe_rotmahi,   0x07e)
+EMIT_RR  (spe_rotma,     0x05a)
+EMIT_RI7 (spe_rotmai,    0x07a)
 
 
 /* Compare, branch, and halt instructions
  */
-EMIT_RR  (spe_heq,       0x3d8);
-EMIT_RI10(spe_heqi,      0x07f);
-EMIT_RR  (spe_hgt,       0x258);
-EMIT_RI10(spe_hgti,      0x04f);
-EMIT_RR  (spe_hlgt,      0x2d8);
-EMIT_RI10(spe_hlgti,     0x05f);
-EMIT_RR  (spe_ceqb,      0x3d0);
-EMIT_RI10(spe_ceqbi,     0x07e);
-EMIT_RR  (spe_ceqh,      0x3c8);
-EMIT_RI10(spe_ceqhi,     0x07d);
-EMIT_RR  (spe_ceq,       0x3c0);
-EMIT_RI10(spe_ceqi,      0x07c);
-EMIT_RR  (spe_cgtb,      0x250);
-EMIT_RI10(spe_cgtbi,     0x04e);
-EMIT_RR  (spe_cgth,      0x248);
-EMIT_RI10(spe_cgthi,     0x04d);
-EMIT_RR  (spe_cgt,       0x240);
-EMIT_RI10(spe_cgti,      0x04c);
-EMIT_RR  (spe_clgtb,     0x2d0);
-EMIT_RI10(spe_clgtbi,    0x05e);
-EMIT_RR  (spe_clgth,     0x2c8);
-EMIT_RI10(spe_clgthi,    0x05d);
-EMIT_RR  (spe_clgt,      0x2c0);
-EMIT_RI10(spe_clgti,     0x05c);
-EMIT_I16 (spe_br,        0x064);
-EMIT_I16 (spe_bra,       0x060);
-EMIT_RI16(spe_brsl,      0x066);
-EMIT_RI16(spe_brasl,     0x062);
-EMIT_RI16(spe_brnz,      0x042);
-EMIT_RI16(spe_brz,       0x040);
-EMIT_RI16(spe_brhnz,     0x046);
-EMIT_RI16(spe_brhz,      0x044);
+EMIT_RR  (spe_heq,       0x3d8)
+EMIT_RI10(spe_heqi,      0x07f)
+EMIT_RR  (spe_hgt,       0x258)
+EMIT_RI10(spe_hgti,      0x04f)
+EMIT_RR  (spe_hlgt,      0x2d8)
+EMIT_RI10(spe_hlgti,     0x05f)
+EMIT_RR  (spe_ceqb,      0x3d0)
+EMIT_RI10(spe_ceqbi,     0x07e)
+EMIT_RR  (spe_ceqh,      0x3c8)
+EMIT_RI10(spe_ceqhi,     0x07d)
+EMIT_RR  (spe_ceq,       0x3c0)
+EMIT_RI10(spe_ceqi,      0x07c)
+EMIT_RR  (spe_cgtb,      0x250)
+EMIT_RI10(spe_cgtbi,     0x04e)
+EMIT_RR  (spe_cgth,      0x248)
+EMIT_RI10(spe_cgthi,     0x04d)
+EMIT_RR  (spe_cgt,       0x240)
+EMIT_RI10(spe_cgti,      0x04c)
+EMIT_RR  (spe_clgtb,     0x2d0)
+EMIT_RI10(spe_clgtbi,    0x05e)
+EMIT_RR  (spe_clgth,     0x2c8)
+EMIT_RI10(spe_clgthi,    0x05d)
+EMIT_RR  (spe_clgt,      0x2c0)
+EMIT_RI10(spe_clgti,     0x05c)
+EMIT_I16 (spe_br,        0x064)
+EMIT_I16 (spe_bra,       0x060)
+EMIT_RI16(spe_brsl,      0x066)
+EMIT_RI16(spe_brasl,     0x062)
+EMIT_RI16(spe_brnz,      0x042)
+EMIT_RI16(spe_brz,       0x040)
+EMIT_RI16(spe_brhnz,     0x046)
+EMIT_RI16(spe_brhz,      0x044)
+
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
@@ -292,13 +330,33 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
@@ -308,52 +366,65 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
-EMIT_RR  (spe_fa,         0x2c4);
-EMIT_RR  (spe_dfa,        0x2cc);
-EMIT_RR  (spe_fs,         0x2c5);
-EMIT_RR  (spe_dfs,        0x2cd);
-EMIT_RR  (spe_fm,         0x2c6);
-EMIT_RR  (spe_dfm,        0x2ce);
-EMIT_RRR (spe_fma,        0x00e);
-EMIT_RR  (spe_dfma,       0x35c);
-EMIT_RRR (spe_fnms,       0x00d);
-EMIT_RR  (spe_dfnms,      0x35e);
-EMIT_RRR (spe_fms,        0x00f);
-EMIT_RR  (spe_dfms,       0x35d);
-EMIT_RR  (spe_dfnma,      0x35f);
-EMIT_R   (spe_frest,      0x1b8);
-EMIT_R   (spe_frsqest,    0x1b9);
-EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da, 155);
-EMIT_RI8 (spe_cflts,      0x1d8, 173);
-EMIT_RI8 (spe_cuflt,      0x1db, 155);
-EMIT_RI8 (spe_cfltu,      0x1d9, 173);
-EMIT_R   (spe_frds,       0x3b9);
-EMIT_R   (spe_fesd,       0x3b8);
-EMIT_RR  (spe_dfceq,      0x3c3);
-EMIT_RR  (spe_dfcmeq,     0x3cb);
-EMIT_RR  (spe_dfcgt,      0x2c3);
-EMIT_RR  (spe_dfcmgt,     0x2cb);
-EMIT_RI7 (spe_dftsv,      0x3bf);
-EMIT_RR  (spe_fceq,       0x3c2);
-EMIT_RR  (spe_fcmeq,      0x3ca);
-EMIT_RR  (spe_fcgt,       0x2c2);
-EMIT_RR  (spe_fcmgt,      0x2ca);
-EMIT_R   (spe_fscrwr,     0x3ba);
-EMIT_    (spe_fscrrd,     0x398);
+EMIT_RR  (spe_fa,         0x2c4)
+EMIT_RR  (spe_dfa,        0x2cc)
+EMIT_RR  (spe_fs,         0x2c5)
+EMIT_RR  (spe_dfs,        0x2cd)
+EMIT_RR  (spe_fm,         0x2c6)
+EMIT_RR  (spe_dfm,        0x2ce)
+EMIT_RRR (spe_fma,        0x00e)
+EMIT_RR  (spe_dfma,       0x35c)
+EMIT_RRR (spe_fnms,       0x00d)
+EMIT_RR  (spe_dfnms,      0x35e)
+EMIT_RRR (spe_fms,        0x00f)
+EMIT_RR  (spe_dfms,       0x35d)
+EMIT_RR  (spe_dfnma,      0x35f)
+EMIT_R   (spe_frest,      0x1b8)
+EMIT_R   (spe_frsqest,    0x1b9)
+EMIT_RR  (spe_fi,         0x3d4)
+EMIT_RI8 (spe_csflt,      0x1da, 155)
+EMIT_RI8 (spe_cflts,      0x1d8, 173)
+EMIT_RI8 (spe_cuflt,      0x1db, 155)
+EMIT_RI8 (spe_cfltu,      0x1d9, 173)
+EMIT_R   (spe_frds,       0x3b9)
+EMIT_R   (spe_fesd,       0x3b8)
+EMIT_RR  (spe_dfceq,      0x3c3)
+EMIT_RR  (spe_dfcmeq,     0x3cb)
+EMIT_RR  (spe_dfcgt,      0x2c3)
+EMIT_RR  (spe_dfcmgt,     0x2cb)
+EMIT_RI7 (spe_dftsv,      0x3bf)
+EMIT_RR  (spe_fceq,       0x3c2)
+EMIT_RR  (spe_fcmeq,      0x3ca)
+EMIT_RR  (spe_fcgt,       0x2c2)
+EMIT_RR  (spe_fcmgt,      0x2ca)
+EMIT_R   (spe_fscrwr,     0x3ba)
+EMIT_    (spe_fscrrd,     0x398)
 
 
 /* Channel instructions
  */
-EMIT_R   (spe_rdch,       0x00d);
-EMIT_R   (spe_rdchcnt,    0x00f);
-EMIT_R   (spe_wrch,       0x10d);
+EMIT_R   (spe_rdch,       0x00d)
+EMIT_R   (spe_rdchcnt,    0x00f)
+EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
@@ -361,6 +432,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index ad9d8f8ced9..99ee74cf14b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
    /* Oh-oh we've stumbled into the SIB thing.
     */
    if (regmem.file == file_REG32 &&
-       regmem.idx == reg_SP) {
+       regmem.idx == reg_SP &&
+       regmem.mod != mod_REG) {
       emit_1ub(p, 0x24);		/* simplistic! */
    }
 
@@ -439,25 +440,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
 }
 
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
    DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
 }
 
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void 
+x86_group1_imm( struct x86_function *p, 
+                unsigned op, struct x86_reg dst, int imm )
 {
-   DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
-   emit_1ub(p, 0x80);
-   emit_modrm_noreg(p, 0, dst);
-   emit_1ub(p, imm);
+   if(-0x80 <= imm && imm < 0x80) {
+      emit_1ub(p, 0x83);
+      emit_modrm_noreg(p, op, dst);
+      emit_1b(p, (char)imm);
+   }
+   else {
+      emit_1ub(p, 0x81);
+      emit_modrm_noreg(p, op, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 7, dst, imm);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af79f07dd39..1b5eaaca850 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
 /* void x86_call( struct x86_function *p, void (*label)() ); */
 void x86_call( struct x86_function *p, struct x86_reg reg);
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index c7155a93168..d7df9490cfa 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -11,6 +11,7 @@ C_SOURCES = \
 	tgsi_info.c \
 	tgsi_iterate.c \
 	tgsi_parse.c \
+	tgsi_ppc.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
 	tgsi_text.c \
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d577..8200cce42f5 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d36884..eee2db7771a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -119,6 +119,8 @@ tgsi_default_declaration( void )
    declaration.UsageMask = TGSI_WRITEMASK_XYZW;
    declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT;
    declaration.Semantic = 0;
+   declaration.Centroid = 0;
+   declaration.Invariant = 0;
    declaration.Padding = 0;
    declaration.Extended = 0;
 
@@ -793,10 +795,14 @@ tgsi_default_instruction_ext_nv( void )
    return instruction_ext_nv;
 }
 
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
 {
-   unsigned u32;
-};
+   return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
 
 unsigned
 tgsi_compare_instruction_ext_nv(
@@ -805,7 +811,7 @@ tgsi_compare_instruction_ext_nv(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_nv
@@ -864,7 +870,7 @@ tgsi_compare_instruction_ext_label(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_label
@@ -905,7 +911,7 @@ tgsi_compare_instruction_ext_texture(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_texture
@@ -1027,7 +1033,7 @@ tgsi_compare_src_register_ext_swz(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_swz
@@ -1095,7 +1101,7 @@ tgsi_compare_src_register_ext_mod(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_mod
@@ -1241,7 +1247,7 @@ tgsi_compare_dst_register_ext_concode(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_concode
@@ -1299,7 +1305,7 @@ tgsi_compare_dst_register_ext_modulate(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 3177f549523..c2a0ac5aff8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -246,6 +246,14 @@ iter_declaration(
    TXT( ", " );
    ENM( decl->Declaration.Interpolate, interpolate_names );
 
+   if (decl->Declaration.Centroid) {
+      TXT( ", CENTROID" );
+   }
+
+   if (decl->Declaration.Invariant) {
+      TXT( ", INVARIANT" );
+   }
+
    EOL();
 
    return TRUE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 41dffc3dbaf..0fdfb91d393 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -133,7 +133,7 @@ tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
    uint numSamplers,
-   struct tgsi_sampler *samplers)
+   struct tgsi_sampler **samplers)
 {
    uint k;
    struct tgsi_parse_context parse;
@@ -467,17 +467,6 @@ micro_exp2(
 }
 
 static void
-micro_f2it(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = (int) src->f[0];
-   dst->i[1] = (int) src->f[1];
-   dst->i[2] = (int) src->f[2];
-   dst->i[3] = (int) src->f[3];
-}
-
-static void
 micro_f2ut(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
@@ -958,14 +947,22 @@ fetch_src_file_channel(
       switch( file ) {
       case TGSI_FILE_CONSTANT:
          assert(mach->Consts);
-         assert(index->i[0] >= 0);
-         assert(index->i[1] >= 0);
-         assert(index->i[2] >= 0);
-         assert(index->i[3] >= 0);
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+         if (index->i[0] < 0)
+            chan->f[0] = 0.0f;
+         else
+            chan->f[0] = mach->Consts[index->i[0]][swizzle];
+         if (index->i[1] < 0)
+            chan->f[1] = 0.0f;
+         else
+            chan->f[1] = mach->Consts[index->i[1]][swizzle];
+         if (index->i[2] < 0)
+            chan->f[2] = 0.0f;
+         else
+            chan->f[2] = mach->Consts[index->i[2]][swizzle];
+         if (index->i[3] < 0)
+            chan->f[3] = 0.0f;
+         else
+            chan->f[3] = mach->Consts[index->i[3]][swizzle];
          break;
 
       case TGSI_FILE_INPUT:
@@ -1037,11 +1034,28 @@ fetch_source(
    union tgsi_exec_channel index;
    uint swizzle;
 
+   /* We start with a direct index into a register file.
+    *
+    *    file[1],
+    *    where:
+    *       file = SrcRegister.File
+    *       [1] = SrcRegister.Index
+    */
    index.i[0] =
    index.i[1] =
    index.i[2] =
    index.i[3] = reg->SrcRegister.Index;
 
+   /* There is an extra source register that indirectly subscripts
+    * a register file. The direct index now becomes an offset
+    * that is being added to the indirect register.
+    *
+    *    file[ind[2].x+1],
+    *    where:
+    *       ind = SrcRegisterInd.File
+    *       [2] = SrcRegisterInd.Index
+    *       .x = SrcRegisterInd.SwizzleX
+    */
    if (reg->SrcRegister.Indirect) {
       union tgsi_exec_channel index2;
       union tgsi_exec_channel indir_index;
@@ -1064,10 +1078,10 @@ fetch_source(
          &indir_index );
 
       /* add value of address register to the offset */
-      index.i[0] += indir_index.i[0];
-      index.i[1] += indir_index.i[1];
-      index.i[2] += indir_index.i[2];
-      index.i[3] += indir_index.i[3];
+      index.i[0] += (int) indir_index.f[0];
+      index.i[1] += (int) indir_index.f[1];
+      index.i[2] += (int) indir_index.f[2];
+      index.i[3] += (int) indir_index.f[3];
 
       /* for disabled execution channels, zero-out the index to
        * avoid using a potential garbage value.
@@ -1078,19 +1092,31 @@ fetch_source(
       }
    }
 
-   if( reg->SrcRegister.Dimension ) {
-      switch( reg->SrcRegister.File ) {
+   /* There is an extra source register that is a second
+    * subscript to a register file. Effectively it means that
+    * the register file is actually a 2D array of registers.
+    *
+    *    file[1][3] == file[1*sizeof(file[1])+3],
+    *    where:
+    *       [3] = SrcRegisterDim.Index
+    */
+   if (reg->SrcRegister.Dimension) {
+      /* The size of the first-order array depends on the register file type.
+       * We need to multiply the index to the first array to get an effective,
+       * "flat" index that points to the beginning of the second-order array.
+       */
+      switch (reg->SrcRegister.File) {
       case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
+         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
+         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
          break;
       case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
+         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
+         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
          break;
       default:
          assert( 0 );
@@ -1101,6 +1127,17 @@ fetch_source(
       index.i[2] += reg->SrcRegisterDim.Index;
       index.i[3] += reg->SrcRegisterDim.Index;
 
+      /* Again, the second subscript index can be addressed indirectly
+       * identically to the first one.
+       * Nothing stops us from indirectly addressing the indirect register,
+       * but there is no need for that, so we won't exercise it.
+       *
+       *    file[1][ind[4].y+3],
+       *    where:
+       *       ind = SrcRegisterDimInd.File
+       *       [4] = SrcRegisterDimInd.Index
+       *       .y = SrcRegisterDimInd.SwizzleX
+       */
       if (reg->SrcRegisterDim.Indirect) {
          union tgsi_exec_channel index2;
          union tgsi_exec_channel indir_index;
@@ -1120,10 +1157,10 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
+         index.i[0] += (int) indir_index.f[0];
+         index.i[1] += (int) indir_index.f[1];
+         index.i[2] += (int) indir_index.f[2];
+         index.i[3] += (int) indir_index.f[3];
 
          /* for disabled execution channels, zero-out the index to
           * avoid using a potential garbage value.
@@ -1133,6 +1170,11 @@ fetch_source(
                index.i[i] = 0;
          }
       }
+
+      /* If by any chance there was a need for a 3D array of register
+       * files, we would have to check whether SrcRegisterDim is followed
+       * by a dimension register and continue the saga.
+       */
    }
 
    swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
@@ -1539,7 +1581,7 @@ exec_tex(struct tgsi_exec_machine *mach,
       else
          lodBias = 0.0;
 
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                   &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
                   &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
       break;
@@ -1565,7 +1607,7 @@ exec_tex(struct tgsi_exec_machine *mach,
       else
          lodBias = 0.0;
 
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                   &r[0], &r[1], &r[2], lodBias,  /* inputs */
                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
       break;
@@ -1591,7 +1633,7 @@ exec_tex(struct tgsi_exec_machine *mach,
       else
          lodBias = 0.0;
 
-      fetch_texel(&mach->Samplers[unit],
+      fetch_texel(mach->Samplers[unit],
                   &r[0], &r[1], &r[2], lodBias,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
@@ -1701,6 +1743,7 @@ exec_declaration(
             break;
 
          default:
+            eval = NULL;
             assert( 0 );
          }
 
@@ -1743,7 +1786,7 @@ exec_instruction(
    case TGSI_OPCODE_ARL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_f2it( &r[0], &r[0] );
+         micro_trunc( &r[0], &r[0] );
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2033,7 +2076,21 @@ exec_instruction(
 
    case TGSI_OPCODE_DOT2ADD:
       /* TGSI_OPCODE_DP2A */
-      assert (0);
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[2], 2, CHAN_X );
+      micro_add( &r[0], &r[0], &r[2] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_INDEX:
@@ -2478,7 +2535,8 @@ exec_instruction(
          micro_mul( &dot, &r[2], &r[2] );
          micro_add( &tmp, &tmp, &dot );
 
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
 
          /* note: w channel is undefined */
@@ -2511,7 +2569,8 @@ exec_instruction(
          micro_mul( &dot, &r[3], &r[3] );
          micro_add( &tmp, &tmp, &dot );
 
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
 
          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index fc40a25e09f..4ffd4efbffa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -68,17 +68,12 @@ struct tgsi_interp_coef
    float dady[NUM_CHANNELS];
 };
 
-
-struct softpipe_tile_cache;  /**< Opaque to TGSI */
-
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
  */
 struct tgsi_sampler
 {
-   const struct pipe_sampler_state *state;
-   struct pipe_texture *texture;
    /** Get samples for four fragments in a quad */
    void (*get_samples)(struct tgsi_sampler *sampler,
                        const float s[QUAD_SIZE],
@@ -86,8 +81,6 @@ struct tgsi_sampler
                        const float p[QUAD_SIZE],
                        float lodbias,
                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
-   void *pipe; /*XXX temporary*/
-   struct softpipe_tile_cache *cache;
 };
 
 /**
@@ -178,6 +171,16 @@ struct tgsi_exec_labels
 #define TGSI_EXEC_MAX_LOOP_NESTING  20
 #define TGSI_EXEC_MAX_CALL_NESTING  20
 
+/* The maximum number of input attributes per vertex. For 2D
+ * input register files, this is the stride between two 1D
+ * arrays.
+ */
+#define TGSI_EXEC_MAX_INPUT_ATTRIBS 17
+
+/* The maximum number of constant vectors per constant buffer.
+ */
+#define TGSI_EXEC_MAX_CONST_BUFFER  4096
+
 /**
  * Run-time virtual machine state for executing TGSI shader.
  */
@@ -195,7 +198,7 @@ struct tgsi_exec_machine
    struct tgsi_exec_vector       *Temps;
    struct tgsi_exec_vector       *Addrs;
 
-   struct tgsi_sampler           *Samplers;
+   struct tgsi_sampler           **Samplers;
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
    unsigned                      ImmLimit;
@@ -258,7 +261,7 @@ tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
    uint numSamplers,
-   struct tgsi_sampler *samplers);
+   struct tgsi_sampler **samplers);
 
 uint
 tgsi_exec_machine_run(
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9b..2cd56e413a5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
       1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
+
+/**
+ * This function is used to avoid and work-around type punning/aliasing
+ * warnings.  The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+   memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
 static void
 next_token(
    struct tgsi_parse_context *ctx,
    void *token )
 {
    assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+   copy_token(token, &ctx->Tokens[ctx->Position]);
+   ctx->Position++;
 }
 
+
 void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
       struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
 
       *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
+      copy_token(&decl->Declaration, &token);
 
       next_token( ctx, &decl->DeclarationRange );
 
@@ -132,8 +149,7 @@ tgsi_parse_token(
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
 
       *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
+      copy_token(&imm->Immediate, &token);
       assert( !imm->Immediate.Extended );
 
       switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
       unsigned extended;
 
       *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
+      copy_token(&inst->Instruction, &token);
       extended = inst->Instruction.Extended;
 
       while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
 
          switch( token.Type ) {
          case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
+            copy_token(&inst->InstructionExtNv, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
+            copy_token(&inst->InstructionExtLabel, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
+            copy_token(&inst->InstructionExtTexture, &token);
             break;
 
          default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+                          &token);
                break;
 
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+                          &token);
                break;
 
             default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+                          &token);
                break;
 
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+                          &token);
                break;
 
             default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
new file mode 100644
index 00000000000..a92b1902e3d
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -0,0 +1,1329 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI to PowerPC code generation.
+ */
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_PPC)
+
+#include "pipe/p_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_sse.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi_dump.h"
+#include "tgsi_exec.h"
+#include "tgsi_ppc.h"
+#include "rtasm/rtasm_ppc.h"
+
+
+/**
+ * Since it's pretty much impossible to form PPC vector immediates, load
+ * them from memory here:
+ */
+const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+   1.0f, -128.0f, 128.0, 0.0
+};
+
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+
+/**
+ * How many TGSI temps should be implemented with real PPC vector registers
+ * rather than memory.
+ */
+#define MAX_PPC_TEMPS 4
+
+
+struct reg_chan_vec
+{
+   struct tgsi_full_src_register src;
+   uint chan;
+   uint vec;
+};
+
+
+/**
+ * Context/state used during code gen.
+ */
+struct gen_context
+{
+   struct ppc_function *f;
+   int inputs_reg;    /**< GP register pointing to input params */
+   int outputs_reg;   /**< GP register pointing to output params */
+   int temps_reg;     /**< GP register pointing to temporary "registers" */
+   int immed_reg;     /**< GP register pointing to immediates buffer */
+   int const_reg;     /**< GP register pointing to constants buffer */
+   int builtins_reg;  /**< GP register pointint to built-in constants */
+
+   int offset_reg;    /**< used to reduce redundant li instructions */
+   int offset_value;
+
+   int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
+   int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
+
+   /**
+    * Map TGSI temps to PPC vector temps.
+    * We have 32 PPC vector regs.  Use 16 of them for storing 4 TGSI temps.
+    * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
+    */
+   int temps_map[MAX_PPC_TEMPS][4];
+
+   /**
+    * Cache of src registers.
+    * This is used to avoid redundant load instructions.
+    */
+   struct {
+      struct tgsi_full_src_register src;
+      uint chan;
+      uint vec;
+   } regs[12];  /* 3 src regs, 4 channels */
+   uint num_regs;
+};
+
+
+/**
+ * Initialize code generation context.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+   uint i;
+
+   memset(gen, 0, sizeof(*gen));
+   gen->f = func;
+   gen->inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen->outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen->temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen->immed_reg = ppc_reserve_register(func, 6);
+   gen->const_reg = ppc_reserve_register(func, 7);
+   gen->builtins_reg = ppc_reserve_register(func, 8);
+   gen->one_vec = -1;
+   gen->bit31_vec = -1;
+   gen->offset_reg = -1;
+   gen->offset_value = -9999999;
+   for (i = 0; i < MAX_PPC_TEMPS; i++) {
+      gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
+   }
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers.  For example:
+ *    lvx v2,r8,r9   # v2 = memory[r8 + r9]
+ *    stvx v2,r8,r9  # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset value is
+ * different than the previous 'li'.
+ * This optimization seems to save about 10% in the instruction count.
+ * Note that we need to unconditionally emit an 'li' inside basic blocks
+ * (such as inside loops).
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+   if (gen->offset_reg <= 0) {
+      /* allocate a GP register for storing load/store offset */
+      gen->offset_reg = ppc_allocate_register(gen->f);
+   }
+
+   /* emit new 'li' if offset is changing */
+   if (gen->offset_value < 0 || gen->offset_value != offset) {
+      gen->offset_value = offset;
+      ppc_li(gen->f, gen->offset_reg, offset);
+   }
+
+   return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static void
+reset_li_offset(struct gen_context *gen)
+{
+   gen->offset_value = -9999999;
+}
+
+
+
+/**
+ * Load the given vector register with {value, value, value, value}.
+ * The value must be in the ppu_builtin_constants[] array.
+ * We wouldn't need this if there was a simple way to load PPC vector
+ * registers with immediate values!
+ */
+static void
+load_constant_vec(struct gen_context *gen, int dst_vec, float value)
+{
+   uint pos;
+   for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
+      if (ppc_builtin_constants[pos] == value) {
+         int offset = pos * 4;
+         int offset_reg = emit_li_offset(gen, offset);
+
+         /* Load 4-byte word into vector register.
+          * The vector slot depends on the effective address we load from.
+          * We know that our builtins start at a 16-byte boundary so we
+          * know that 'swizzle' tells us which vector slot will have the
+          * loaded word.  The other vector slots will be undefined.
+          */
+         ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
+         /* splat word[pos % 4] across the vector reg */
+         ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
+         return;
+      }
+   }
+   assert(0 && "Need to add new constant to ppc_builtin_constants array");
+}
+
+
+/**
+ * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
+ */
+static int
+gen_one_vec(struct gen_context *gen)
+{
+   if (gen->one_vec < 0) {
+      gen->one_vec = ppc_allocate_vec_register(gen->f);
+      load_constant_vec(gen, gen->one_vec, 1.0f);
+   }
+   return gen->one_vec;
+}
+
+/**
+ * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
+ */
+static int
+gen_get_bit31_vec(struct gen_context *gen)
+{
+   if (gen->bit31_vec < 0) {
+      gen->bit31_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vspltisw(gen->f, gen->bit31_vec, -1);
+      ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
+   }
+   return gen->bit31_vec;
+}
+
+
+/**
+ * Register fetch.  Return PPC vector register with result.
+ */
+static int
+emit_fetch(struct gen_context *gen,
+           const struct tgsi_full_src_register *reg,
+           const unsigned chan_index)
+{
+   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+   int dst_vec = -1;
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         {
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
+            ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         if (reg->SrcRegister.Index < MAX_PPC_TEMPS) {
+            /* use PPC vec register */
+            dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
+         }
+         else {
+            /* use memory-based temp register "file" */
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
+            ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         {
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our immediates start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
+            int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our constants start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+      break;
+   case TGSI_EXTSWIZZLE_ZERO:
+      ppc_vzero(gen->f, dst_vec);
+      break;
+   case TGSI_EXTSWIZZLE_ONE:
+      {
+         int one_vec = gen_one_vec(gen);
+         dst_vec = ppc_allocate_vec_register(gen->f);
+         ppc_vmove(gen->f, dst_vec, one_vec);
+      }
+      break;
+   default:
+      assert( 0 );
+   }
+
+   assert(dst_vec >= 0);
+
+   {
+      uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
+      if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+         int bit31_vec = gen_get_bit31_vec(gen);
+
+         switch (sign_op) {
+         case TGSI_UTIL_SIGN_CLEAR:
+            /* vec = vec & ~bit31 */
+            ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_SET:
+            /* vec = vec | bit31 */
+            ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         case TGSI_UTIL_SIGN_TOGGLE:
+            /* vec = vec ^ bit31 */
+            ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec);
+            break;
+         default:
+            assert(0);
+         }
+      }
+   }
+
+   return dst_vec;
+}
+
+
+
+/**
+ * Test if two TGSI src registers refer to the same memory location.
+ * We use this to avoid redundant register loads.
+ */
+static boolean
+equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
+               const struct tgsi_full_src_register *b, uint chan_b)
+{
+   int swz_a, swz_b;
+   int sign_a, sign_b;
+   if (a->SrcRegister.File != b->SrcRegister.File)
+      return FALSE;
+   if (a->SrcRegister.Index != b->SrcRegister.Index)
+      return FALSE;
+   swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
+   swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+   if (swz_a != swz_b)
+      return FALSE;
+   sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
+   sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
+   if (sign_a != sign_b)
+      return FALSE;
+   return TRUE;
+}
+
+
+/**
+ * Given a TGSI src register and channel index, return the PPC vector
+ * register containing the value.  We use a cache to prevent re-loading
+ * the same register multiple times.
+ * \return index of PPC vector register with the desired src operand
+ */
+static int
+get_src_vec(struct gen_context *gen,
+            struct tgsi_full_instruction *inst, int src_reg, uint chan)
+{
+   const const struct tgsi_full_src_register *src = 
+      &inst->FullSrcRegisters[src_reg];
+   int vec;
+   uint i;
+
+   /* check the cache */
+   for (i = 0; i < gen->num_regs; i++) {
+      if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
+         /* cache hit */
+         assert(gen->regs[i].vec >= 0);
+         return gen->regs[i].vec;
+      }
+   }
+
+   /* cache miss: allocate new vec reg and emit fetch/load code */
+   vec = emit_fetch(gen, src, chan);
+   gen->regs[gen->num_regs].src = *src;
+   gen->regs[gen->num_regs].chan = chan;
+   gen->regs[gen->num_regs].vec = vec;
+   gen->num_regs++;
+
+   assert(gen->num_regs <= Elements(gen->regs));
+
+   assert(vec >= 0);
+
+   return vec;
+}
+
+
+/**
+ * Clear the src operand cache.  To be called at the end of each emit function.
+ */
+static void
+release_src_vecs(struct gen_context *gen)
+{
+   uint i;
+   for (i = 0; i < gen->num_regs; i++) {
+      const const struct tgsi_full_src_register src = gen->regs[i].src;
+      if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY &&
+            src.SrcRegister.Index < MAX_PPC_TEMPS)) {
+         ppc_release_vec_register(gen->f, gen->regs[i].vec);
+      }
+   }
+   gen->num_regs = 0;
+}
+
+
+
+static int
+get_dst_vec(struct gen_context *gen, 
+            const struct tgsi_full_instruction *inst,
+            unsigned chan_index)
+{
+   const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+   if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+       reg->DstRegister.Index < MAX_PPC_TEMPS) {
+      int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+      return vec;
+   }
+   else {
+      return ppc_allocate_vec_register(gen->f);
+   }
+}
+
+
+/**
+ * Register store.  Store 'src_vec' at location indicated by 'reg'.
+ * \param free_vec  Should the src_vec be released when done?
+ */
+static void
+emit_store(struct gen_context *gen,
+           int src_vec,
+           const struct tgsi_full_instruction *inst,
+           unsigned chan_index,
+           boolean free_vec)
+{
+   const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      {
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         int offset_reg = emit_li_offset(gen, offset);
+         ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:
+      if (reg->DstRegister.Index < MAX_PPC_TEMPS) {
+         if (!free_vec) {
+            int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+            if (dst_vec != src_vec)
+               ppc_vmove(gen->f, dst_vec, src_vec);
+         }
+         free_vec = FALSE;
+      }
+      else {
+         int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
+         int offset_reg = emit_li_offset(gen, offset);
+         ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
+      }
+      break;
+#if 0
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+#endif
+   default:
+      assert( 0 );
+   }
+
+#if 0
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* assert( 0 ); */
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+#endif
+
+   if (free_vec)
+      ppc_release_vec_register(gen->f, src_vec);
+}
+
+
+static void
+emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0, v1;
+   uint chan_index;
+
+   v0 = get_src_vec(gen, inst, 0, CHAN_X);
+   v1 = ppc_allocate_vec_register(gen->f);
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_RSQ:
+      /* v1 = 1.0 / sqrt(v0) */
+      ppc_vrsqrtefp(gen->f, v1, v0);
+      break;
+   case TGSI_OPCODE_RCP:
+      /* v1 = 1.0 / v0 */
+      ppc_vrefp(gen->f, v1, v0);
+      break;
+   default:
+      assert(0);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      emit_store(gen, v1, inst, chan_index, FALSE);
+   }
+
+   release_src_vecs(gen);
+   ppc_release_vec_register(gen->f, v1);
+}
+
+
+static void
+emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   uint chan_index;
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      int v0 = get_src_vec(gen, inst, 0, chan_index);   /* v0 = srcreg[0] */
+      int v1 = get_dst_vec(gen, inst, chan_index);
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ABS:
+         /* turn off the most significant bit of each vector float word */
+         {
+            int bit31_vec = gen_get_bit31_vec(gen);
+            ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
+         }
+         break;
+      case TGSI_OPCODE_FLOOR:
+         ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
+         break;
+      case TGSI_OPCODE_FRAC:
+         ppc_vrfim(gen->f, v1, v0);      /* tmp = floor(v0) */
+         ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
+         break;
+      case TGSI_OPCODE_EXPBASE2:
+         ppc_vexptefp(gen->f, v1, v0);     /* v1 = 2^v0 */
+         break;
+      case TGSI_OPCODE_LOGBASE2:
+         /* XXX this may be broken! */
+         ppc_vlogefp(gen->f, v1, v0);      /* v1 = log2(v0) */
+         break;
+      case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_SWZ:
+         if (v0 != v1)
+            ppc_vmove(gen->f, v1, v0);
+         break;
+      default:
+         assert(0);
+      }
+      emit_store(gen, v1, inst, chan_index, TRUE);  /* store v0 */
+   }
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int zero_vec = -1;
+   uint chan;
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+      zero_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vzero(gen->f, zero_vec);
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_dst_vec(gen, inst, chan);
+
+      /* emit binop */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_ADD:
+         ppc_vaddfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_SUB:
+         ppc_vsubfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MUL:
+         ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
+         break;
+      case TGSI_OPCODE_MIN:
+         ppc_vminfp(gen->f, v2, v0, v1);
+         break;
+      case TGSI_OPCODE_MAX:
+         ppc_vmaxfp(gen->f, v2, v0, v1);
+         break;
+      default:
+         assert(0);
+      }
+
+      /* store v2 */
+      emit_store(gen, v2, inst, chan, TRUE);
+   }
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
+      ppc_release_vec_register(gen->f, zero_vec);
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   uint chan;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_src_vec(gen, inst, 2, chan);
+      int v3 = get_dst_vec(gen, inst, chan);
+
+      /* emit ALU */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* store v3 */
+      emit_store(gen, v3, inst, chan, TRUE);
+   }
+
+   release_src_vecs(gen);
+}
+
+
+/**
+ * Vector comparisons, resulting in 1.0 or 0.0 values.
+ */
+static void
+emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   uint chan;
+   int one_vec = gen_one_vec(gen);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_dst_vec(gen, inst, chan);
+      boolean complement = FALSE;
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_SNE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SEQ:
+         ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SGE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SLT:
+         ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
+         break;
+
+      case TGSI_OPCODE_SLE:
+         complement = TRUE;
+         /* fall-through */
+      case TGSI_OPCODE_SGT:
+         ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
+         break;
+      default:
+         assert(0);
+      }
+
+      /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
+
+      if (complement)
+         ppc_vandc(gen->f, v2, one_vec, v2);    /* v2 = one_vec & ~v2 */
+      else
+         ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
+
+      /* store v2 */
+      emit_store(gen, v2, inst, chan, TRUE);
+   }
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v0, v1, v2;
+   uint chan_index;
+
+   v2 = ppc_allocate_vec_register(gen->f);
+
+   ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
+
+   v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
+   v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
+   v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
+   v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
+   ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
+      v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
+      v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
+   }
+   else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
+      v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);         /* v2 = v2 + v1 */
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
+      emit_store(gen, v2, inst, chan_index, FALSE);  /* store v2, free v2 later */
+   }
+
+   release_src_vecs(gen);
+
+   ppc_release_vec_register(gen->f, v2);
+}
+
+
+/** Approximation for vr = pow(va, vb) */
+static void
+ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
+{
+   /* pow(a,b) ~= exp2(log2(a) * b) */
+   int t_vec = ppc_allocate_vec_register(f);
+   int zero_vec = ppc_allocate_vec_register(f);
+
+   ppc_vzero(f, zero_vec);
+
+   ppc_vlogefp(f, t_vec, va);                   /* t = log2(va) */
+   ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec);  /* t = t * vb */
+   ppc_vexptefp(f, vr, t_vec);                  /* vr = 2^t */
+
+   ppc_release_vec_register(f, t_vec);
+   ppc_release_vec_register(f, zero_vec);
+}
+
+
+static void
+emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int one_vec = gen_one_vec(gen);
+
+   /* Compute X */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      emit_store(gen, one_vec, inst, CHAN_X, FALSE);
+   }
+
+   /* Compute Y, Z */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int x_vec;
+      int zero_vec = ppc_allocate_vec_register(gen->f);
+
+      x_vec = get_src_vec(gen, inst, 0, CHAN_X);  /* x_vec = src[0].x */
+
+      ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
+      ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
+      }
+
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+         int y_vec, w_vec;
+         int z_vec = ppc_allocate_vec_register(gen->f);
+         int pow_vec = ppc_allocate_vec_register(gen->f);
+         int pos_vec = ppc_allocate_vec_register(gen->f);
+         int p128_vec = ppc_allocate_vec_register(gen->f);
+         int n128_vec = ppc_allocate_vec_register(gen->f);
+
+         y_vec = get_src_vec(gen, inst, 0, CHAN_Y);  /* y_vec = src[0].y */
+         ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
+
+         w_vec = get_src_vec(gen, inst, 0, CHAN_W);  /* w_vec = src[0].w */
+
+         /* clamp W to [-128, 128] */
+         load_constant_vec(gen, p128_vec, 128.0f);
+         load_constant_vec(gen, n128_vec, -128.0f);
+         ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+         ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
+
+         /* if temp.x > 0
+          *    z = pow(tmp.y, tmp.w)
+          * else
+          *    z = 0.0
+          */
+         ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec);      /* pow = pow(y, w) */
+         ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
+         ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
+
+         emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
+
+         ppc_release_vec_register(gen->f, z_vec);
+         ppc_release_vec_register(gen->f, pow_vec);
+         ppc_release_vec_register(gen->f, pos_vec);
+         ppc_release_vec_register(gen->f, p128_vec);
+         ppc_release_vec_register(gen->f, n128_vec);
+      }
+
+      ppc_release_vec_register(gen->f, zero_vec);
+   }
+
+   /* Compute W */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+   }
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int one_vec = gen_one_vec(gen);
+   int src_vec;
+
+   /* get src arg */
+   src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+   /* Compute X = 2^floor(src) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_X);
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vrfim(gen->f, tmp_vec, src_vec);             /* tmp = floor(src); */
+      ppc_vexptefp(gen->f, dst_vec, tmp_vec);          /* dst = 2 ^ tmp */
+      emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Y = src - floor(src) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vrfim(gen->f, tmp_vec, src_vec);             /* tmp = floor(src); */
+      ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec);   /* dst = src - tmp */
+      emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Z = RoughApprox2ToX(src) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+      ppc_vexptefp(gen->f, dst_vec, src_vec);          /* dst = 2 ^ src */
+      emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+   }
+
+   /* Compute W = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+   }
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int bit31_vec = gen_get_bit31_vec(gen);
+   const int one_vec = gen_one_vec(gen);
+   int src_vec, abs_vec;
+
+   /* get src arg */
+   src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+   /* compute abs(src) */
+   abs_vec = ppc_allocate_vec_register(gen->f);
+   ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec);     /* abs = src & ~bit31 */
+
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) &&
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+
+      /* compute tmp = floor(log2(abs)) */
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vlogefp(gen->f, tmp_vec, abs_vec);           /* tmp = log2(abs) */
+      ppc_vrfim(gen->f, tmp_vec, tmp_vec);             /* tmp = floor(tmp); */
+
+      /* Compute X = tmp */
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+         emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+      }
+      
+      /* Compute Y = abs / 2^tmp */
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         const int zero_vec = ppc_allocate_vec_register(gen->f);
+         ppc_vzero(gen->f, zero_vec);
+         ppc_vexptefp(gen->f, tmp_vec, tmp_vec);       /* tmp = 2 ^ tmp */
+         ppc_vrefp(gen->f, tmp_vec, tmp_vec);          /* tmp = 1 / tmp */
+         /* tmp = abs * tmp + zero */
+         ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
+         emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+         ppc_release_vec_register(gen->f, zero_vec);
+      }
+
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Z = RoughApproxLog2(abs) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+      ppc_vlogefp(gen->f, dst_vec, abs_vec);           /* dst = log2(abs) */
+      emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+   }
+
+   /* Compute W = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+   }
+
+   ppc_release_vec_register(gen->f, abs_vec);
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+   int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+   int pow_vec = ppc_allocate_vec_register(gen->f);
+   int chan;
+
+   ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      emit_store(gen, pow_vec, inst, chan, FALSE);
+   }
+
+   ppc_release_vec_register(gen->f, pow_vec);
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int x0_vec, y0_vec, z0_vec;
+   int x1_vec, y1_vec, z1_vec;
+   int zero_vec, tmp_vec;
+   int tmp2_vec;
+
+   zero_vec = ppc_allocate_vec_register(gen->f);
+   ppc_vzero(gen->f, zero_vec);
+
+   tmp_vec = ppc_allocate_vec_register(gen->f);
+   tmp2_vec = ppc_allocate_vec_register(gen->f);
+
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+      x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+   }
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
+      y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
+   }
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+      z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
+      z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
+   }
+
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
+      /* tmp = y0 * z1 */
+      ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
+      /* tmp = tmp - z0 * y1*/
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+   }
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
+      /* tmp = z0 * x1 */
+      ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
+      /* tmp = tmp - x0 * z1 */
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+   }
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
+      /* tmp = x0 * y1 */
+      ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
+      /* tmp = tmp - y0 * x1 */
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
+   }
+   /* W is undefined */
+
+   ppc_release_vec_register(gen->f, tmp_vec);
+   ppc_release_vec_register(gen->f, zero_vec);
+   release_src_vecs(gen);
+}
+
+static int
+emit_instruction(struct gen_context *gen,
+                 struct tgsi_full_instruction *inst)
+{
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_FLOOR:
+   case TGSI_OPCODE_FRAC:
+   case TGSI_OPCODE_EXPBASE2:
+   case TGSI_OPCODE_LOGBASE2:
+      emit_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_RCP:
+      emit_scalar_unaryop(gen, inst);
+      break;
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+      emit_binop(gen, inst);
+      break;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SGE:
+      emit_inequality(gen, inst);
+      break;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_LRP:
+      emit_triop(gen, inst);
+      break;
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+      emit_dotprod(gen, inst);
+      break;
+   case TGSI_OPCODE_LIT:
+      emit_lit(gen, inst);
+      break;
+   case TGSI_OPCODE_LOG:
+      emit_log(gen, inst);
+      break;
+   case TGSI_OPCODE_EXP:
+      emit_exp(gen, inst);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_pow(gen, inst);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(gen, inst);
+      break;
+   case TGSI_OPCODE_END:
+      /* normal end */
+      return 1;
+   default:
+      return 0;
+   }
+   return 1;
+}
+
+
+static void
+emit_declaration(
+   struct ppc_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+#if 0
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      first = decl->DeclarationRange.First;
+      last = decl->DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Declaration.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+#endif
+   }
+}
+
+
+
+static void
+emit_prologue(struct ppc_function *func)
+{
+   /* XXX set up stack frame */
+}
+
+
+static void
+emit_epilogue(struct ppc_function *func)
+{
+   ppc_return(func);
+   /* XXX restore prev stack frame */
+   debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
+}
+
+
+
+/**
+ * Translate a TGSI vertex/fragment shader to PPC code.
+ *
+ * \param tokens  the TGSI input shader
+ * \param func  the output PPC code/function
+ * \param immediates  buffer to place immediates, later passed to PPC func
+ * \return TRUE for success, FALSE if translation failed
+ */
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *func,
+              float (*immediates)[4],
+              boolean do_swizzles )
+{
+   static int use_ppc_asm = -1;
+   struct tgsi_parse_context parse;
+   /*boolean instruction_phase = FALSE;*/
+   unsigned ok = 1;
+   uint num_immediates = 0;
+   struct gen_context gen;
+
+   if (use_ppc_asm < 0) {
+      /* If GALLIUM_NOPPC is set, don't use PPC codegen */
+      use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
+   }
+   if (!use_ppc_asm)
+      return FALSE;
+
+   if (0) {
+      debug_printf("\n********* TGSI->PPC ********\n");
+      tgsi_dump(tokens, 0);
+   }
+
+   util_init_math();
+
+   init_gen_context(&gen, func);
+
+   emit_prologue(func);
+
+   tgsi_parse_init( &parse, tokens );
+
+   while (!tgsi_parse_end_of_tokens(&parse) && ok) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
+            emit_declaration(func, &parse.FullToken.FullDeclaration );
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* splat each immediate component into a float[4] vector for SoA */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            uint i;
+            assert(size <= 4);
+            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
+            for (i = 0; i < size; i++) {
+               immediates[num_immediates][i] =
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+            num_immediates++;
+         }
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+   emit_epilogue(func);
+
+   tgsi_parse_free( &parse );
+
+   if (ppc_num_instructions(func) == 0) {
+      /* ran out of memory for instructions */
+      ok = FALSE;
+   }
+
+   if (!ok)
+      debug_printf("TGSI->PPC translation failed\n");
+
+   return ok;
+}
+
+#endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
new file mode 100644
index 00000000000..829ec075e7f
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h
@@ -0,0 +1,51 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_PPC_H
+#define TGSI_PPC_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+struct tgsi_token;
+struct ppc_function;
+
+extern const float ppc_builtin_constants[];
+
+
+boolean
+tgsi_emit_ppc(const struct tgsi_token *tokens,
+              struct ppc_function *function,
+              float (*immediates)[4],
+              boolean do_swizzles);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif /* TGSI_PPC_H */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4d59106dbfa..8dfd2ced089 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,9 +25,16 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
+
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
+#include "util/u_sse.h"
+#endif
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -35,8 +42,6 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#ifdef PIPE_ARCH_X86
-
 /* for 1/sqrt()
  *
  * This costs about 100fps (close to 10%) in gears:
@@ -509,10 +514,31 @@ emit_coef_dady(
  * Function call helpers.
  */
 
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
 static void
-emit_push_gp(
-   struct x86_function *func )
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
 {
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX) );
@@ -522,12 +548,49 @@ emit_push_gp(
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX) );
-}
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( i ) );
+         ++n;
+      }
+   
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( i ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
 
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
    /* Restore GP registers in a reverse order.
     */
    x86_pop(
@@ -539,39 +602,6 @@ x86_pop_gp(
    x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
-
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
 
    sse_movaps(
       func,
@@ -582,6 +612,7 @@ emit_func_call_dst(
 static void
 emit_func_call_dst_src(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src,
    void (PIPE_CDECL *code)() )
@@ -593,10 +624,119 @@ emit_func_call_dst_src(
 
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       code );
 }
 
+
+#if defined(PIPE_ARCH_SSE)
+
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = (float) (1 << ipart) */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = (float) mantissa(x) */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generate with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+#endif /* PIPE_ARCH_SSE */
+
+
+
 /**
  * Low-level instruction translators.
  */
@@ -639,38 +779,42 @@ cos4f(
 static void
 emit_cos(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save, 
       xmm_dst,
       cos4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
 ex24f(
    float *store )
 {
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
    store[0] = util_fast_exp2( store[0] );
    store[1] = util_fast_exp2( store[1] );
    store[2] = util_fast_exp2( store[2] );
    store[3] = util_fast_exp2( store[3] );
-#else
-   store[0] = powf( 2.0f, store[0] );
-   store[1] = powf( 2.0f, store[1] );
-   store[2] = powf( 2.0f, store[2] );
-   store[3] = powf( 2.0f, store[3] );
 #endif
 }
 
 static void
 emit_ex2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       ex24f );
 }
@@ -710,10 +854,12 @@ flr4f(
 static void
 emit_flr(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       flr4f );
 }
@@ -731,31 +877,42 @@ frc4f(
 static void
 emit_frc(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       frc4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
 lg24f(
    float *store )
 {
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
    store[0] = util_fast_log2( store[0] );
    store[1] = util_fast_log2( store[1] );
    store[2] = util_fast_log2( store[2] );
    store[3] = util_fast_log2( store[3] );
+#endif
 }
 
 static void
 emit_lg2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       lg24f );
 }
@@ -797,30 +954,32 @@ emit_neg(
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
+__attribute__((force_align_arg_pointer))
+#endif
 pow4f(
    float *store )
 {
-#if FAST_MATH
+#if defined(PIPE_ARCH_SSE)
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
+#else
    store[0] = util_fast_pow( store[0], store[4] );
    store[1] = util_fast_pow( store[1], store[5] );
    store[2] = util_fast_pow( store[2], store[6] );
    store[3] = util_fast_pow( store[3], store[7] );
-#else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
 #endif
 }
 
 static void
 emit_pow(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src )
 {
    emit_func_call_dst_src(
       func,
+      xmm_save,
       xmm_dst,
       xmm_src,
       pow4f );
@@ -913,10 +1072,12 @@ sin4f(
 
 static void
 emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
           unsigned xmm_dst)
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       sin4f );
 }
@@ -1336,7 +1497,7 @@ emit_instruction(
                get_temp(
                   TGSI_EXEC_TEMP_MINUS_128_I,
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
+            emit_pow( func, 3, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
             sse_xorps(
                func,
@@ -1382,11 +1543,11 @@ emit_instruction(
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
             emit_MOV( func, 1, 0 );
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = ex2(floor(src.x)) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                emit_MOV( func, 2, 1 );
-               emit_ex2( func, 2 );
+               emit_ex2( func, 3, 2 );
                STORE( func, *inst, 2, 0, CHAN_X );
             }
             /* dst.y = src.x - floor(src.x) */
@@ -1398,7 +1559,7 @@ emit_instruction(
          }
          /* dst.z = ex2(src.x) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 0 );
+            emit_ex2( func, 3, 0 );
             STORE( func, *inst, 0, 0, CHAN_Z );
          }
       }
@@ -1416,21 +1577,21 @@ emit_instruction(
          FETCH( func, *inst, 0, 0, CHAN_X );
          emit_abs( func, 0 );
          emit_MOV( func, 1, 0 );
-         emit_lg2( func, 1 );
+         emit_lg2( func, 2, 1 );
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
             STORE( func, *inst, 1, 0, CHAN_Z );
          }
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = floor(lg2(abs(src.x))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                STORE( func, *inst, 1, 0, CHAN_X );
             }
             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 1 );
+               emit_ex2( func, 2, 1 );
                emit_rcp( func, 1, 1 );
                emit_mul( func, 0, 1 );
                STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1605,7 +1766,18 @@ emit_instruction(
 
    case TGSI_OPCODE_DOT2ADD:
    /* TGSI_OPCODE_DP2A */
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_INDEX:
@@ -1620,7 +1792,7 @@ emit_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
+         emit_frc( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1633,7 +1805,7 @@ emit_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
+         emit_flr( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1645,7 +1817,7 @@ emit_instruction(
    case TGSI_OPCODE_EXPBASE2:
    /* TGSI_OPCODE_EX2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
+      emit_ex2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1654,7 +1826,7 @@ emit_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
+      emit_lg2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1664,7 +1836,7 @@ emit_instruction(
    /* TGSI_OPCODE_POW */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
+      emit_pow( func, 0, 0, 1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1755,7 +1927,7 @@ emit_instruction(
 
    case TGSI_OPCODE_COS:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
+      emit_cos( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1814,7 +1986,7 @@ emit_instruction(
 
    case TGSI_OPCODE_SIN:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
+      emit_sin( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1908,12 +2080,12 @@ emit_instruction(
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
+         emit_cos( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
+         emit_sin( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
@@ -1939,7 +2111,39 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      return 0;
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+         /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+         FETCH( func, *inst, 4, 0, CHAN_X );    /* xmm4 = src[0].x */
+         FETCH( func, *inst, 5, 0, CHAN_Y );    /* xmm5 = src[0].y */
+         FETCH( func, *inst, 6, 0, CHAN_Z );    /* xmm6 = src[0].z */
+         if (dims == 4) {
+            FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+         }
+         emit_MOV( func, 0, 4 );                /* xmm0 = xmm3 */
+         emit_mul( func, 0, 4 );                /* xmm0 *= xmm3 */
+         emit_MOV( func, 1, 5 );                /* xmm1 = xmm4 */
+         emit_mul( func, 1, 5 );                /* xmm1 *= xmm4 */
+         emit_add( func, 0, 1 );                /* xmm0 += xmm1 */
+         emit_MOV( func, 1, 6 );                /* xmm1 = xmm5 */
+         emit_mul( func, 1, 6 );                /* xmm1 *= xmm5 */
+         emit_add( func, 0, 1 );                /* xmm0 += xmm1 */
+         if (dims == 4) {
+            emit_MOV( func, 1, 7 );             /* xmm1 = xmm7 */
+            emit_mul( func, 1, 7 );             /* xmm1 *= xmm7 */
+            emit_add( func, 0, 0 );             /* xmm0 += xmm1 */
+         }
+         emit_rsqrt( func, 1, 0 );              /* xmm1 = 1/sqrt(xmm0) */
+         FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+            if (chan_index < dims) {
+               emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+               STORE( func, *inst, 4+chan_index, 0, chan_index );
+            }
+         }
+      }
       break;
 
    case TGSI_OPCODE_DIV:
@@ -1947,7 +2151,16 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_TXL:
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d7..f611f82773a 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,8 @@ C_SOURCES = \
 	u_gen_mipmap.c \
 	u_handle_table.c \
 	u_hash_table.c \
+	u_keymap.c \
+	u_linear.c \
 	u_math.c \
 	u_mm.c \
 	u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc8..8a04955a16e 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
 		'u_gen_mipmap.c',
 		'u_handle_table.c',
 		'u_hash_table.c',
+		'u_keymap.c',
 		'u_math.c',
 		'u_mm.c',
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
-        'u_stream_stdc.c',
-        'u_stream_wd.c',
+		'u_stream_stdc.c',
+		'u_stream_wd.c',
 		'u_tile.c',
 		'u_time.c',
 	])
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 6ff3e6e0a62..0d019808b09 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,6 +36,13 @@
 #include <windows.h>
 #include <winddi.h>
 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <windows.h> 
+#include <types.h> 
+
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 
 #ifndef WIN32_LEAN_AND_MEAN
@@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap)
       OutputDebugStringA(buf);
       buf[0] = '\0';
    }
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+   wchar_t *wide_format;
+   long wide_str_len;   
+   char buf[512];   
+   int ret;   
+#if (_WIN32_WCE < 600)
+   ret = vsprintf(buf, format, ap);   
+   if(ret < 0){   
+       sprintf(buf, "Cant handle debug print!");   
+       ret = 25;
+   }
+#else
+   ret = vsprintf_s(buf, 512, format, ap);   
+   if(ret < 0){   
+       sprintf_s(buf, 512, "Cant handle debug print!");   
+       ret = 25;
+   }
+#endif
+   buf[ret] = '\0';   
+   /* Format is ascii - needs to be converted to wchar_t for printing */   
+   wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);   
+   wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));   
+   if (wide_format) {   
+      MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,   
+            wide_format, wide_str_len);   
+      NKDbgPrintfW(wide_format, wide_format);   
+      free(wide_format);   
+   } 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
    /* TODO */
 #else /* !PIPE_SUBSYSTEM_WINDOWS */
 #ifdef DEBUG
@@ -308,6 +343,13 @@ debug_get_flags_option(const char *name,
    str = _debug_get_option(name);
    if(!str)
       result = dfault;
+   else if (!util_strcmp(str, "help")) {
+      result = dfault;
+      while (flags->name) {
+         debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
+         flags++;
+      }
+   }
    else {
       result = 0;
       while( flags->name ) {
@@ -317,7 +359,12 @@ debug_get_flags_option(const char *name,
       }
    }
 
-   debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   if (str) {
+      debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+   }
+   else {
+      debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   }
 
    return result;
 }
@@ -627,6 +674,7 @@ void
 debug_dump_surface_bmp(const char *filename,
                        struct pipe_surface *surface)
 {
+#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
    struct util_stream *stream;
    unsigned surface_usage;
    struct bmp_file_header bmfh;
@@ -693,6 +741,7 @@ error2:
    FREE(rgba);
 error1:
    ;
+#endif
 }
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 00000000000..01b17ddb1b3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+   struct cso_hash *cso;   
+   unsigned key_size;
+   unsigned max_entries; /* XXX not obeyed net */
+   unsigned num_entries;
+   keymap_delete_func delete_func;
+};
+
+
+struct keymap_item
+{
+   void *key, *value;
+};
+
+
+/**
+ * This the default key-delete function used when the client doesn't
+ * provide one.
+ */
+static void
+default_delete_func(const struct keymap *map,
+                    const void *key, void *data, void *user)
+{
+   FREE((void*) data);
+}
+
+
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+   return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return 4-byte hash key for a block of bytes.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+   unsigned i, hash;
+
+   keySize /= 4; /* convert from bytes to uints */
+
+   hash = 0;
+   for (i = 0; i < keySize; i++) {
+      hash ^= (i + 1) * ((const unsigned *) key)[i];
+   }
+
+   /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/
+
+   return hash;
+}
+
+
+/**
+ * Create a new map.
+ * \param keySize  size of the keys in bytes
+ * \param maxEntries  max number of entries to allow (~0 = infinity)
+ * \param deleteFunc  optional callback to call when entries
+ *                    are deleted/replaced
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                 keymap_delete_func deleteFunc)
+{
+   struct keymap *map = MALLOC_STRUCT(keymap);
+   if (!map)
+      return NULL;
+   
+   map->cso = cso_hash_create();
+   if (!map->cso) {
+      FREE(map);
+      return NULL;
+   }
+   
+   map->max_entries = maxEntries;
+   map->num_entries = 0;
+   map->key_size = keySize;
+   map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+
+   return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries.  The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param user  user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+   util_keymap_remove_all(map, user);
+   cso_hash_delete(map->cso);
+   FREE(map);
+}
+
+
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+   
+   iter = cso_hash_find(map->cso, key_hash);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *) cso_hash_iter_data(iter);
+      if (!memcmp(item->key, key, map->key_size))
+         break;
+      iter = cso_hash_iter_next(iter);
+   }
+   
+   return iter;
+}
+
+
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter)) {
+      return NULL;
+   }
+   else {
+      return hash_table_item(iter);
+   }
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry (calling the delete callback on the previous entry).
+ * If the maximum capacity of the map is reached an old entry
+ * will be deleted (the delete callback will be called).
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+   struct cso_hash_iter iter;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (item) {
+      /* call delete callback for old entry/item */
+      map->delete_func(map, item->key, item->value, user);
+      item->value = (void *) data;
+      return TRUE;
+   }
+   
+   item = MALLOC_STRUCT(keymap_item);
+   if (!item)
+      return FALSE;
+
+   item->key = mem_dup(key, map->key_size);
+   item->value = (void *) data;
+   
+   iter = cso_hash_insert(map->cso, key_hash, item);
+   if (cso_hash_iter_is_null(iter)) {
+      FREE(item);
+      return FALSE;
+   }
+
+   map->num_entries++;
+
+   return TRUE;
+}
+
+
+/**
+ * Look up a key in the map and return the associated data pointer.
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (!item)
+      return NULL;
+   
+   return item->value;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * The delete callback will be called if the given key/entry is found.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+   unsigned key_hash;
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter))
+      return;
+   
+   item = hash_table_item(iter);
+   assert(item);
+   map->delete_func(map, item->key, item->value, user);
+   FREE(item->key);
+   FREE(item);
+   
+   map->num_entries--;
+
+   cso_hash_erase(map->cso, iter);
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+   
+   iter = cso_hash_first_node(map->cso);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *)
+         cso_hash_take(map->cso, cso_hash_iter_key(iter));
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+      iter = cso_hash_first_node(map->cso);
+   }
+}
+
+
+extern void
+util_keymap_info(const struct keymap *map)
+{
+   debug_printf("Keymap %p: %u of max %u entries\n",
+                (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 00000000000..8d60a76fc3c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Delete/callback function type */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+                                   const void *key, void *data,
+                                   void *user);
+
+
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc);
+
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user);
+
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
diff --git a/src/gallium/auxiliary/util/u_linear.c b/src/gallium/auxiliary/util/u_linear.c
new file mode 100644
index 00000000000..a76704ffc7f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linear.c
@@ -0,0 +1,69 @@
+
+#include "pipe/p_debug.h"
+#include "u_linear.h"
+
+void
+pipe_linear_to_tile(size_t src_stride, void *src_ptr,
+		    struct pipe_tile_info *t, void *dst_ptr)
+{
+   int x, y, z;
+   char *ptr;
+   size_t bytes = t->cols * t->block.size;
+
+
+   assert(pipe_linear_check_tile(t));
+
+   /* lets write lineary to the tiled buffer */
+   for (y = 0; y < t->tiles_y; y++) {
+      for (x = 0; x < t->tiles_x; x++) {
+	 /* this inner loop could be replace with SSE magic */
+	 ptr = (char*)src_ptr + src_stride * t->rows * y + bytes * x;
+	 for (z = 0; z < t->rows; z++) {
+	    memcpy(dst_ptr, ptr, bytes);
+	    dst_ptr += bytes;
+	    ptr += src_stride;
+	 }
+      }
+   }
+}
+
+void pipe_linear_from_tile(struct pipe_tile_info *t, void  *src_ptr,
+			   size_t dst_stride, void *dst_ptr)
+{
+   int x, y, z;
+   char *ptr;
+   size_t bytes = t->cols * t->block.size;
+
+   /* lets read lineary from the tiled buffer */
+   for (y = 0; y < t->tiles_y; y++) {
+      for (x = 0; x < t->tiles_x; x++) {
+	 /* this inner loop could be replace with SSE magic */
+	 ptr = (char*)dst_ptr + dst_stride * t->rows * y + bytes * x;
+	 for (z = 0; z < t->rows; z++) {
+	    memcpy(ptr, src_ptr, bytes);
+	    src_ptr += bytes;
+	    ptr += dst_stride;
+	 }
+      }
+   }
+}
+
+void
+pipe_linear_fill_info(struct pipe_tile_info *t,
+		      struct pipe_format_block *block,
+		      unsigned tile_width, unsigned tile_height,
+		      unsigned tiles_x, unsigned tiles_y)
+{
+   t->block = *block;
+
+   t->tile.width = tile_width;
+   t->tile.height = tile_height;
+   t->cols = t->tile.width / t->block.width;
+   t->rows = t->tile.height / t->block.height;
+   t->tile.size = t->cols * t->rows * t->block.size;
+
+   t->tiles_x = tiles_x;
+   t->tiles_y = tiles_y;
+   t->stride = t->cols * t->tiles_x * t->block.size;
+   t->size = t->tiles_x * t->tiles_y * t->tile.size;
+}
diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h
new file mode 100644
index 00000000000..e337cfd7702
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linear.h
@@ -0,0 +1,60 @@
+
+#ifndef U_LINEAR_H
+#define U_LINEAR_H
+
+#include "pipe/p_format.h"
+struct pipe_tile_info
+{
+   unsigned size;
+   unsigned stride;
+
+   /* The number of tiles */
+   unsigned tiles_x;
+   unsigned tiles_y;
+
+   /* size of each tile expressed in blocks */
+   unsigned cols;
+   unsigned rows;
+
+   /* Describe the tile in pixels */
+   struct pipe_format_block tile;
+
+   /* Describe each block within the tile */
+   struct pipe_format_block block;
+};
+
+void pipe_linear_to_tile(size_t src_stride, void *src_ptr,
+			 struct pipe_tile_info *t, void  *dst_ptr);
+
+void pipe_linear_from_tile(struct pipe_tile_info *t, void  *src_ptr,
+			   size_t dst_stride, void *dst_ptr);
+
+/**
+ * Convenience function to fillout a pipe_tile_info struct.
+ * @t info to fill out.
+ * @block block info about pixel layout
+ * @tile_width the width of the tile in pixels
+ * @tile_height the height of the tile in pixels
+ * @tiles_x number of tiles in x axis
+ * @tiles_y number of tiles in y axis
+ */
+void pipe_linear_fill_info(struct pipe_tile_info *t,
+			   struct pipe_format_block *block,
+			   unsigned tile_width, unsigned tile_height,
+			   unsigned tiles_x, unsigned tiles_y);
+
+static INLINE boolean pipe_linear_check_tile(struct pipe_tile_info *t)
+{
+   if (t->tile.size != t->block.size * t->cols * t->rows)
+      return FALSE;
+
+   if (t->stride != t->block.size * t->cols * t->tiles_x)
+      return FALSE;
+
+   if (t->size < t->stride * t->rows * t->tiles_y)
+      return FALSE;
+
+   return TRUE;
+}
+
+#endif /* U_LINEAR_H */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 1ae3234423a..aa4fa17b596 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -68,7 +68,7 @@ __inline double ceil(double val)
    return ceil_val;
 }
 
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL
 __inline double floor(double val)
 {
    double floor_val;
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719dc..1a6b596421f 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
+
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 01dd67c810b..45ce257b5e5 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
 
 
 void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
 {
    debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
 }
 
 struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
 {
    struct mem_block *heap, *block;
   
@@ -165,7 +165,7 @@ SliceBlock(struct mem_block *p,
 
 
 struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 {
    struct mem_block *p;
    const int mask = (1 << align2)-1;
@@ -202,7 +202,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 
 
 struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
 {
    struct mem_block *p;
 
@@ -241,7 +241,7 @@ Join2Blocks(struct mem_block *p)
 }
 
 int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
 {
    if (!b)
       return 0;
@@ -270,7 +270,7 @@ mmFreeMem(struct mem_block *b)
 
 
 void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
 {
    struct mem_block *p;
 
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cbe..ce20e487635 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
  * input: total size in bytes
  * return: a heap pointer if OK, NULL if error
  */
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
 
 /**
  * Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
  *		startSearch = linear offset from start of heap to begin search
  * return: pointer to the allocated block, 0 if error
  */
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, 
                             int startSearch);
 
 /**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
  * input: pointer to a block
  * return: 0 if OK, -1 if error
  */
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
 
 /**
  * Free block starts at offset
  * input: pointer to a heap, start offset
  * return: pointer to a block
  */
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
 
 /**
  * destroy MM
  */
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
 
 /**
  * For debuging purpose.
  */
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
new file mode 100644
index 00000000000..e2a8491e62c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SSE intrinsics portability header.
+ * 
+ * Although the SSE intrinsics are support by all modern x86 and x86-64 
+ * compilers, there are some intrisincs missing in some implementations 
+ * (especially older MSVC versions). This header abstracts that away.
+ */
+
+#ifndef U_SSE_H_
+#define U_SSE_H_
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* MSVC before VC8 does not support the _mm_castxxx_yyy */
+#if defined(_MSC_VER) && _MSC_VER < 1500
+
+union __declspec(align(16)) m128_types {
+   __m128 m128;
+   __m128i m128i;
+   __m128d m128d;
+};
+
+static __inline __m128
+_mm_castsi128_ps(__m128i a)
+{
+   union m128_types u;
+   u.m128i = a;
+   return u.m128;
+}
+
+static __inline __m128i
+_mm_castps_si128(__m128 a)
+{
+   union m128_types u;
+   u.m128 = a;
+   return u.m128i;
+}
+
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+#endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 853c503f4ff..32f6b072a00 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned r;
          r = float_to_ubyte(pRow[0]);
-         *dst++ = r;
+         *dst++ = (ubyte) r;
       }
       p += src_stride;
    }
@@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned a;
          a = float_to_ubyte(pRow[3]);
-         *dst++ = a;
+         *dst++ = (ubyte) a;
       }
       p += src_stride;
    }
@@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned r;
          r = float_to_ubyte(pRow[0]);
-         *dst++ = r;
+         *dst++ = (ubyte) r;
       }
       p += src_stride;
    }
@@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src,
 }
 
 
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+                   unsigned w, unsigned h,
+                   float *p,
+                   unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = *src++;
+      }
+      p += dst_stride;
+   }
+}
+
+
 /*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
 
 /**
@@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_Z24S8_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_Z32_FLOAT:
+      z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_YCBCR:
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
       break;
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index bf7d1d1c8d5..57b80e56042 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start,
 }
 
 
-#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY)
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
 void util_time_sleep(unsigned usecs)
 {
    LONGLONG start, curr, end;
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cb0631baf52..98554d7f521 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -64,9 +64,13 @@
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
-#define CELL_MAX_SPUS 6
+#define CELL_MAX_SPUS 8
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+#define CELL_MAX_WIDTH 1024    /**< max framebuffer width */
+#define CELL_MAX_HEIGHT 1024   /**< max framebuffer width */
 
 #define TILE_SIZE 32
 
@@ -94,43 +98,94 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_VS_EXECUTE          22
-#define CELL_CMD_FLUSH_BUFFER_RANGE  23
+#define CELL_CMD_STATE_FS_CONSTANTS  21
+#define CELL_CMD_STATE_RASTERIZER    22
+#define CELL_CMD_VS_EXECUTE          23
+#define CELL_CMD_FLUSH_BUFFER_RANGE  24
+#define CELL_CMD_FENCE               25
 
 
+/** Command/batch buffers */
 #define CELL_NUM_BUFFERS 4
 #define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
+/** Debug flags */
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
+#define CELL_DEBUG_CMD                  (1 << 5)
+#define CELL_DEBUG_CACHE                (1 << 6)
 
-#define CELL_DEBUG_CHECKER  (1 << 0)
-#define CELL_DEBUG_SYNC     (1 << 1)
+#define CELL_FENCE_IDLE      0
+#define CELL_FENCE_EMITTED   1
+#define CELL_FENCE_SIGNALLED 2
 
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
+struct cell_fence
+{
+   /** There's a 16-byte status qword per SPU */
+   volatile uint status[CELL_MAX_SPUS][4];
+};
 
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+/**
+ * Fence command sent to SPUs.  In response, the SPUs will write
+ * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory.
+ */
+struct cell_command_fence
+{
+   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   struct cell_fence *fence;
+};
 
 
 /**
  * Command to specify per-fragment operations state and generated code.
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
+ * for the fallback/C per-pixel code.  They're not used when we generate
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
-   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   struct pipe_blend_color blend_color;
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
 /** Max instructions for fragment programs */
-#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
 
 /**
- * Command to send a fragment progra to SPUs.
+ * Command to send a fragment program to SPUs.
  */
 struct cell_command_fragment_program
 {
@@ -145,7 +200,7 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
+   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -153,6 +208,16 @@ struct cell_command_framebuffer
 
 
 /**
+ * Tell SPUs about rasterizer state.
+ */
+struct cell_command_rasterizer
+{
+   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   struct pipe_rasterizer_state rasterizer;
+};
+
+
+/**
  * Clear framebuffer to the given value/color.
  */
 struct cell_command_clear_surface
@@ -248,23 +313,27 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
-/** XXX unions don't seem to work */
-/* XXX this should go away; all commands should be placed in batch buffers */
-struct cell_command
+#define MAX_SPU_FUNCTIONS 12
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
 {
-#if 0
-   struct cell_command_framebuffer fb;
-   struct cell_command_clear_surface clear;
-   struct cell_command_render render;
-#endif
-   struct cell_command_vs vs;
-} ALIGN16_ATTRIB;
+   uint num;
+   char names[MAX_SPU_FUNCTIONS][16];
+   uint addrs[MAX_SPU_FUNCTIONS];
+   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
 
 
 /** This is the object passed to spe_create_thread() */
@@ -273,11 +342,13 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
-   struct cell_command *cmd;
+   float inv_timebase;    /**< 1.0/timebase, for perf measurement */
 
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
+
+   struct cell_spu_function_info *spu_functions;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index b28f4c5c31f..9358a47284c 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -24,6 +24,7 @@ SOURCES = \
 	cell_clear.c \
 	cell_context.c \
 	cell_draw_arrays.c \
+	cell_fence.c \
 	cell_flush.c \
 	cell_gen_fragment.c \
 	cell_gen_fp.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 16882c01295..962775cd335 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -28,6 +28,7 @@
 
 #include "cell_context.h"
 #include "cell_batch.h"
+#include "cell_fence.h"
 #include "cell_spu.h"
 
 
@@ -42,7 +43,9 @@
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
-   uint buf = 0, tries = 0;
+   static uint prev_buffer = 0;
+   uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS;
+   uint tries = 0;
 
    /* Find a buffer that's marked as free by all SPUs */
    while (1) {
@@ -58,8 +61,13 @@ cell_get_empty_buffer(struct cell_context *cell)
                   cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
                }
                /*
-               printf("PPU: ALLOC BUFFER %u\n", buf);
+               printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries);
                */
+               prev_buffer = buf;
+
+               /* release tex buffer associated w/ prev use of this batch buf */
+               cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]);
+
                return buf;
             }
          }
@@ -82,6 +90,37 @@ cell_get_empty_buffer(struct cell_context *cell)
 
 
 /**
+ * Append a fence command to the current batch buffer.
+ * Note that we're sure there's always room for this because of the
+ * adjusted size check in cell_batch_free_space().
+ */
+static void
+emit_fence(struct cell_context *cell)
+{
+   const uint batch = cell->cur_batch;
+   const uint size = cell->buffer_size[batch];
+   struct cell_command_fence *fence_cmd;
+   struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+   uint i;
+
+   /* set fence status to emitted, not yet signalled */
+   for (i = 0; i < cell->num_spus; i++) {
+      fence->status[i][0] = CELL_FENCE_EMITTED;
+   }
+
+   ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
+
+   fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
+   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->fence = fence;
+
+   /* update batch buffer size */
+   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+   assert(sizeof(struct cell_command_fence) % 8 == 0);
+}
+
+
+/**
  * Flush the current batch buffer to the SPUs.
  * An empty buffer will be found and set as the new current batch buffer
  * for subsequent commands/data.
@@ -91,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->buffer_size[batch];
+   uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -99,6 +138,14 @@ cell_batch_flush(struct cell_context *cell)
    if (size == 0)
       return;
 
+   /* Before we use this batch buffer, make sure any fenced texture buffers
+    * are released.
+    */
+   if (cell->fenced_buffers[batch].head) {
+      emit_fence(cell);
+      size = cell->buffer_size[batch];
+   }
+
    flushing = TRUE;
 
    assert(batch < CELL_NUM_BUFFERS);
@@ -139,6 +186,7 @@ uint
 cell_batch_free_space(const struct cell_context *cell)
 {
    uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
+   free -= sizeof(struct cell_command_fence);
    return free;
 }
 
@@ -169,7 +217,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BUFFER_SIZE) {
+   if (bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
@@ -223,7 +271,7 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    padbytes = (alignment - (size % alignment)) % alignment;
 
-   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
+   if (padbytes + bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bbe..037635e4660 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
+
+   /* Technically, the surface's contents are now known and cleared,
+    * so we could set the status to PIPE_SURFACE_STATUS_CLEAR.  But
+    * it turns out it's quite painful to recognize when any particular
+    * surface goes from PIPE_SURFACE_STATUS_CLEAR to 
+    * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+    * the drawing commands could be operating on numerous draw buffers,
+    * which we'd have to iterate through to set all their stati...
+    * For now, we cheat a bit and set the surface's status to DEFINED
+    * right here.  Later we should revisit this and set the status to
+    * CLEAR here, and find a better place to set the status to DEFINED.
+    */
+   ps->status = PIPE_SURFACE_STATUS_DEFINED;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d1..22d552d8e3d 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -47,6 +47,7 @@
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_surface.h"
@@ -62,6 +63,8 @@ cell_destroy_context( struct pipe_context *pipe )
 {
    struct cell_context *cell = cell_context(pipe);
 
+   util_delete_keymap(cell->fragment_ops_cache, NULL);
+
    cell_spu_exit(cell);
 
    align_free(cell);
@@ -85,13 +88,16 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
-#ifdef DEBUG
 static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
+   {"cmd", CELL_DEBUG_CMD},       /**< SPUs dump command buffer info */
+   {"cache", CELL_DEBUG_CACHE},   /**< report texture cache stats on exit */
    {NULL, 0}
 };
-#endif
 
 
 struct pipe_context *
@@ -99,6 +105,7 @@ cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
+   uint i;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -125,11 +132,14 @@ cell_create_context(struct pipe_screen *screen,
    cell_init_state_functions(cell);
    cell_init_shader_functions(cell);
    cell_init_surface_functions(cell);
-   cell_init_texture_functions(cell);
    cell_init_vertex_functions(cell);
 
    cell->draw = cell_draw_create(cell);
 
+   /* Create cache of fragment ops generated code */
+   cell->fragment_ops_cache =
+      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
    cell_init_vbuf(cell);
 
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
@@ -143,17 +153,31 @@ cell_create_context(struct pipe_screen *screen,
                                               cell_debug_flags, 
                                               0 );
 
+   for (i = 0; i < CELL_NUM_BUFFERS; i++)
+      cell_fence_init(&cell->fenced_buffers[i].fence);
+
+
    /*
     * SPU stuff
     */
-   cell->num_spus = 6;
-   /* XXX is this in SDK 3.0 only?
-   cell->num_spus = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
-   */
+   /* This call only works with SDK 3.0.  Anyone still using 2.1??? */
+   cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
+   cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
+   if (cell->debug_flags) {
+      printf("Cell: found %d Cell(s) with %u SPUs\n",
+             cell->num_cells, cell->num_spus);
+   }
+   if (getenv("CELL_NUM_SPUS")) {
+      cell->num_spus = atoi(getenv("CELL_NUM_SPUS"));
+      assert(cell->num_spus > 0);
+   }
 
    cell_start_spus(cell);
 
    cell_init_batch_buffers(cell);
 
+   /* make sure SPU initializations are done before proceeding */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+
    return &cell->pipe;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 14914b9c6f8..eb1397bb3fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
 #include "cell/common.h"
 #include "rtasm/rtasm_ppc_spe.h"
 #include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
 
 
 struct cell_vbuf_render;
@@ -67,31 +68,29 @@ struct cell_fragment_shader_state
 
 
 /**
- * Cell blend state atom, subclass of pipe_blend_state.
+ * Key for mapping per-fragment state to cached SPU machine code.
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops
  */
-struct cell_blend_state
+struct cell_fragment_ops_key
 {
-   struct pipe_blend_state base;
-
-   /**
-    * Generated code to perform alpha blending
-    */
-   struct spe_function code;
+   struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
+   struct pipe_depth_stencil_alpha_state dsa;
+   enum pipe_format color_format;
+   enum pipe_format zs_format;
 };
 
 
+struct cell_buffer_node;
+
 /**
- * Cell depth/stencil/alpha state atom, subclass of
- * pipe_depth_stencil_alpha_state.
+ * Fenced buffer list.  List of buffers which can be unreferenced after
+ * the fence has been executed/signalled.
  */
-struct cell_depth_stencil_alpha_state
+struct cell_buffer_list
 {
-   struct pipe_depth_stencil_alpha_state base;
-
-   /**
-    * Generated code to perform alpha, stencil, and depth testing on the SPE
-    */
-   struct spe_function code;
+   struct cell_fence fence ALIGN16_ATTRIB;
+   struct cell_buffer_node *head;
 };
 
 
@@ -104,10 +103,10 @@ struct cell_context
 
    struct cell_winsys *winsys;
 
-   const struct cell_blend_state *blend;
+   const struct pipe_blend_state *blend;
    const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
    uint num_samplers;
-   const struct cell_depth_stencil_alpha_state   *depth_stencil;
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
    const struct pipe_rasterizer_state *rasterizer;
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
@@ -135,6 +134,11 @@ struct cell_context
    uint *tex_map;
 
    uint dirty;
+   uint dirty_textures;  /* bitmask of texture units */
+   uint dirty_samplers;  /* bitmask of sampler units */
+
+   /** Cache of code generated for per-fragment ops */
+   struct keymap *fragment_ops_cache;
 
    /** The primitive drawing context */
    struct draw_context *draw;
@@ -149,8 +153,9 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
+   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
 
-   uint num_spus;
+   uint num_cells, num_spus;
 
    /** Buffers for command batches, vertex/index data */
    uint buffer_size[CELL_NUM_BUFFERS];
@@ -162,6 +167,14 @@ struct cell_context
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 
+   /** Associated with each command/batch buffer is a list of pipe_buffers
+    * that are fenced.  When the last command in a buffer is executed, the
+    * fence will be signalled, indicating that any pipe_buffers preceeding
+    * that fence can be unreferenced (and probably freed).
+    */
+   struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS];
+
+
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
new file mode 100644
index 00000000000..867b5dcaa09
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -0,0 +1,168 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <unistd.h>
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "cell_context.h"
+#include "cell_batch.h"
+#include "cell_fence.h"
+#include "cell_texture.h"
+
+
+void
+cell_fence_init(struct cell_fence *fence)
+{
+   uint i;
+   ASSERT_ALIGN16(fence->status);
+   for (i = 0; i < CELL_MAX_SPUS; i++) {
+      fence->status[i][0] = CELL_FENCE_IDLE;
+   }
+}
+
+
+boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence)
+{
+   uint i;
+   for (i = 0; i < cell->num_spus; i++) {
+      if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
+         return FALSE;
+      /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
+   }
+   return TRUE;
+}
+
+
+void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence)
+{
+   while (!cell_fence_signalled(cell, fence)) {
+      usleep(10);
+   }
+
+#ifdef DEBUG
+   {
+      uint i;
+      for (i = 0; i < cell->num_spus; i++) {
+         assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+      }
+   }
+#endif
+}
+
+
+
+
+struct cell_buffer_node
+{
+   struct pipe_buffer *buffer;
+   struct cell_buffer_node *next;
+};
+
+
+static void
+cell_add_buffer_to_list(struct cell_context *cell,
+                        struct cell_buffer_list *list,
+                        struct pipe_buffer *buffer)
+{
+   struct pipe_screen *ps = cell->pipe.screen;
+   struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node);
+   /* create new list node which references the buffer, insert at head */
+   if (node) {
+      pipe_buffer_reference(ps, &node->buffer, buffer);
+      node->next = list->head;
+      list->head = node;
+   }
+}
+
+
+/**
+ * Wait for completion of the given fence, then unreference any buffers
+ * on the list.
+ * This typically unrefs/frees texture buffers after any rendering which uses
+ * them has completed.
+ */
+void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list)
+{
+   if (list->head) {
+      struct pipe_screen *ps = cell->pipe.screen;
+      struct cell_buffer_node *node;
+
+      cell_fence_finish(cell, &list->fence);
+
+      /* traverse the list, unreferencing buffers, freeing nodes */
+      node = list->head;
+      while (node) {
+         struct cell_buffer_node *next = node->next;
+         assert(node->buffer);
+         pipe_buffer_unmap(ps, node->buffer);
+#if 0
+         printf("Unref buffer %p\n", node->buffer);
+         if (node->buffer->refcount == 1)
+            printf("   Delete!\n");
+#endif
+         pipe_buffer_reference(ps, &node->buffer, NULL);
+         FREE(node);
+         node = next;
+      }
+      list->head = NULL;
+   }
+}
+
+
+/**
+ * This should be called for each render command.
+ * Any texture buffers that are current bound will be added to a fenced
+ * list to be freed later when the fence is executed/signalled.
+ */
+void
+cell_add_fenced_textures(struct cell_context *cell)
+{
+   struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch];
+   uint i;
+
+   for (i = 0; i < cell->num_textures; i++) {
+      struct cell_texture *ct = cell->texture[i];
+      if (ct) {
+         uint level;
+         for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+            if (ct->tiled_buffer[level]) {
+#if 0
+               printf("Adding texture %p buffer %p to list\n",
+                      ct, ct->tiled_buffer[level]);
+#endif
+               cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]);
+            }
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h
new file mode 100644
index 00000000000..536b4ba411a
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_fence.h
@@ -0,0 +1,57 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef CELL_FENCE_H
+#define CELL_FENCE_H
+
+
+extern void
+cell_fence_init(struct cell_fence *fence);
+
+
+extern boolean
+cell_fence_signalled(const struct cell_context *cell,
+                     const struct cell_fence *fence);
+
+
+extern void
+cell_fence_finish(const struct cell_context *cell,
+                  const struct cell_fence *fence);
+
+
+
+extern void
+cell_free_fenced_buffers(struct cell_context *cell,
+                         struct cell_buffer_list *list);
+
+
+extern void
+cell_add_fenced_textures(struct cell_context *cell);
+
+
+#endif /* CELL_FENCE_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b720101..a64967b4b9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
       flags |= CELL_FLUSH_WAIT;
    }
 
-   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+   if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
       flags |= CELL_FLUSH_WAIT;
 
    draw_flush( cell->draw );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14a..96a1743fc10 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -37,7 +37,7 @@
  * \author Brian Paul
  */
 
-
+#include <math.h>
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -51,25 +51,43 @@
 #include "cell_gen_fp.h"
 
 
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define MAX_TEMPS 16
+#define MAX_IMMED  8
 
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
 
 /**
  * Context needed during code generation.
  */
 struct codegen
 {
+   struct cell_context *cell;
    int inputs_reg;      /**< 1st function parameter */
    int outputs_reg;     /**< 2nd function parameter */
    int constants_reg;   /**< 3rd function parameter */
-   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */
+
+   int num_imm;  /**< number of immediates */
 
    int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[3];
+   int itemps[12];
+
+   /** Current IF/ELSE/ENDIF nesting level */
+   int if_nesting;
+   /** Index of execution mask register */
+   int exec_mask_reg;
+
+   /** KIL mask: indicates which fragments have been killed */
+   int kill_mask_reg;
+
+   int frame_size;  /**< Stack frame size, in words */
 
    struct spe_function *f;
    boolean error;
@@ -112,19 +130,78 @@ get_const_one_reg(struct codegen *gen)
 {
    if (gen->one_reg <= 0) {
       gen->one_reg = spe_allocate_available_register(gen->f);
-   }
 
-   /* one = {1.0, 1.0, 1.0, 1.0} */
-   spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
-   printf("il\tr%d, 1.0f\n", gen->one_reg);
-#endif
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(gen->f, gen->one_reg, 1.0f);
+
+      spe_indent(gen->f, -4);
+   }
 
    return gen->one_reg;
 }
 
 
 /**
+ * Return index of the pixel execution mask.
+ * The register is allocated an initialized upon the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+   if (gen->exec_mask_reg <= 0) {
+      gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
+      /* exec_mask = {~0, ~0, ~0, ~0} */
+      spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+
+      spe_indent(gen->f, -4);
+   }
+
+   return gen->exec_mask_reg;
+}
+
+
+static boolean
+is_register_src(struct codegen *gen, int channel,
+                const struct tgsi_full_src_register *src)
+{
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+   if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
+      return FALSE;
+   }
+   if (src->SrcRegister.File == TGSI_FILE_TEMPORARY ||
+       src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+      return TRUE;
+   }
+   return FALSE;
+}
+
+  
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+              const struct tgsi_full_dst_register *dst)
+{
+   if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+      return TRUE;
+   }
+   else {
+      return FALSE;
+   }
+}
+
+  
+/**
  * Return the index of the SPU temporary containing the named TGSI
  * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
  * just return the corresponding SPE register.  If the TGIS register
@@ -136,35 +213,93 @@ get_src_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_src_register *src)
 {
-   int reg;
+   int reg = -1;
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   boolean reg_is_itemp = FALSE;
+   uint sign_op;
+
+   assert(swizzle >= TGSI_SWIZZLE_X);
+   assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
+
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      /* Load const one float and early out */
+      reg = get_const_one_reg(gen);
+   }
+   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      /* Load const zero float and early out */
+      reg = get_itemp(gen);
+      spe_xor(gen->f, reg, reg, reg);
+   }
+   else {
+      assert(swizzle < 4);
 
-   /* XXX need to examine src swizzle info here.
-    * That will involve changing the channel var...
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_TEMPORARY:
+         reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_INPUT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_CONSTANT:
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
+         }
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   /*
+    * Handle absolute value, negate or set-negative of src register.
     */
+   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+   if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+      /*
+       * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+       */
+      const int bit31mask_reg = get_itemp(gen);
+      int result_reg;
+
+      if (reg_is_itemp) {
+         /* re-use 'reg' for the result */
+         result_reg = reg;
+      }
+      else {
+         /* alloc a new reg for the result */
+         result_reg = get_itemp(gen);
+      }
 
+      /* mask with bit 31 set, the rest cleared */
+      spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
 
-   switch (src->SrcRegister.File) {
-   case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[src->SrcRegister.Index][channel];
-      break;
-   case TGSI_FILE_INPUT:
-      {
-         /* offset is measured in quadwords, not bytes */
-         int offset = src->SrcRegister.Index * 4 + channel;
-         reg = get_itemp(gen);
-         /* Load:  reg = memory[(machine_reg) + offset] */
-         spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
-         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
+      if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+         spe_andc(gen->f, result_reg, reg, bit31mask_reg);
       }
-      break;
-   case TGSI_FILE_IMMEDIATE:
-      /* xxx fall-through for now / fix */
-   case TGSI_FILE_CONSTANT:
-      /* xxx fall-through for now / fix */
-   default:
-      assert(0);
+      else if (sign_op == TGSI_UTIL_SIGN_SET) {
+         spe_and(gen->f, result_reg, reg, bit31mask_reg);
+      }
+      else {
+         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+         spe_xor(gen->f, result_reg, reg, bit31mask_reg);
+      }
+
+      reg = result_reg;
    }
 
    return reg;
@@ -183,11 +318,14 @@ get_dst_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_dst_register *dest)
 {
-   int reg;
+   int reg = -1;
 
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      if (gen->if_nesting > 0)
+         reg = get_itemp(gen);
+      else
+         reg = gen->temp_regs[dest->DstRegister.Index][channel];
       break;
    case TGSI_FILE_OUTPUT:
       reg = get_itemp(gen);
@@ -211,19 +349,59 @@ store_dest_reg(struct codegen *gen,
                int value_reg, int channel,
                const struct tgsi_full_dst_register *dest)
 {
+   /*
+    * XXX need to implement dst reg clamping/saturation
+    */
+#if 0
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      break;
+   default:
+      assert( 0 );
+   }
+#endif
+
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      /* no-op */
+      if (gen->if_nesting > 0) {
+         int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+         int exec_reg = get_exec_mask_reg(gen);
+         /* Mix d with new value according to exec mask:
+          * d[i] = mask_reg[i] ? value_reg : d_reg
+          */
+         spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+      }
+      else {
+         /* we're not inside a condition or loop: do nothing special */
+
+      }
       break;
    case TGSI_FILE_OUTPUT:
       {
          /* offset is measured in quadwords, not bytes */
          int offset = dest->DstRegister.Index * 4 + channel;
-         /* Store: memory[(machine_reg) + offset] = reg */
-         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
-         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
-#endif
+         if (gen->if_nesting > 0) {
+            int exec_reg = get_exec_mask_reg(gen);
+            int curval_reg = get_itemp(gen);
+            /* First read the current value from memory:
+             * Load:  curval = memory[(machine_reg) + offset]
+             */
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+            /* Mix curval with newvalue according to exec mask:
+             * d[i] = mask_reg[i] ? value_reg : d_reg
+             */
+            spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+            /* Store: memory[(machine_reg) + offset] = curval */
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
+         }
+         else {
+            /* Store: memory[(machine_reg) + offset] = reg */
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
+         }
       }
       break;
    default:
@@ -232,27 +410,114 @@ store_dest_reg(struct codegen *gen,
 }
 
 
+
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 1024; /* XXX temporary, should be dynamic */
+
+   spe_comment(gen->f, -4, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      int sp_reg = spe_allocate_available_register(gen->f);
+      /* offset = -framesize */
+      spe_load_int(gen->f, offset_reg, -gen->frame_size);
+      /* sp = $sp */
+      spe_move(gen->f, sp_reg, SPE_REG_SP);
+      /* $sp = $sp + offset_reg */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* save $sp in stack frame */
+      spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+      spe_release_register(gen->f, sp_reg);
+   }
+   else {
+      /* save stack pointer    # stqd $sp,-frameSize($sp) */
+      spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+      /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   }
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+   const int return_reg = 3;
+
+   spe_comment(gen->f, -4, "Function epilogue:");
+
+   spe_comment(gen->f, 0, "return the killed mask");
+   if (gen->kill_mask_reg > 0) {
+      /* shader called KIL, return the "alive" mask */
+      spe_move(gen->f, return_reg, gen->kill_mask_reg);
+   }
+   else {
+      /* return {0,0,0,0} */
+      spe_load_uint(gen->f, return_reg, 0);
+   }
+
+   spe_comment(gen->f, 0, "restore stack and return");
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      /* offset = framesize */
+      spe_load_int(gen->f, offset_reg, gen->frame_size);
+      /* $sp = $sp + offset */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+   }
+   else {
+      /* restore stack pointer    # ai $sp,$sp,frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   }
+
+   /* restore $lr              # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, src_reg[4], dst_reg[4];
+
+   spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         /* XXX we don't always need to actually emit a mov instruction here */
-         spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
-         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
-         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+             is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+            /* special-case: register to memory store */
+            store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
+         else {
+            spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+            store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
          free_itemps(gen);
       }
    }
    return true;
 }
 
-
 /**
  * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
  * becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -261,24 +526,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
-   /* Loop over Red/Green/Blue/Alpha channels */
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
+   spe_comment(gen->f, -4, "ADD:");
+   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* get indexes of the two src, one dest SPE registers */
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* Emit actual SPE instruction: d = s1 + s2 */
-         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
-
+         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          /* Free any intermediate temps we allocated */
          free_itemps(gen);
       }
@@ -286,6 +552,99 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit subtract.  See emit_ADD for comments.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "SUB:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 - s2 */
+         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit multiply add.  See emit_ADD for comments.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "MAD:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 * s2 + s3 */
+         spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+/**
+ * Emit linear interpolate.  See emit_ADD for comments.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4];
+   spe_comment(gen->f, -4, "LERP:");
+   /* setup/get src/dst/temp regs */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);
+      }
+   }
+
+   /* d = s3 + s1(s2 - s3) */
+   /* do all subtracts, then all fma, then all stores to better pipeline */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   free_itemps(gen);
+   return true;
+}
 
 /**
  * Emit multiply.  See emit_ADD for comments.
@@ -293,17 +652,63 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+   spe_comment(gen->f, -4, "MUL:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 * s2 */
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal.  See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
    int ch;
+   spe_comment(gen->f, -4, "RCP:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         /* d = s1 * s2 */
-         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
+         /* d = 1/s1 */
+         spe_frest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal sqrt.  See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RSQ:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frsqest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -311,6 +716,270 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit absolute value.  See emit_ADD for comments.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "ABS:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         const int bit31mask_reg = get_itemp(gen);
+
+         /* mask with bit 31 set, the rest cleared */  
+         spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
+
+         /* d = sign bit cleared in s1 */
+         spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit 3 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s1x_reg, s1y_reg, s1z_reg;
+   int s2x_reg, s2y_reg, s2z_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+   spe_comment(gen->f, -4, "DP3:");
+
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
+
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
+
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 4 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s0x_reg, s0y_reg, s0z_reg, s0w_reg;
+   int s1x_reg, s1y_reg, s1z_reg, s1w_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+   spe_comment(gen->f, -4, "DP4:");
+
+   s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg);
+
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg);
+
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg);
+
+   /* t1 = w0 * w1 + t1 */
+   spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit homogeneous dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   /* XXX rewrite this function to look more like DP3/DP4 */
+   int ch;
+   spe_comment(gen->f, -4, "DPH:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* t = w1 + t */
+   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit 3-component vector normalize.
+ */
+static boolean
+emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int src_reg[3];
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
+   spe_comment(gen->f, -4, "NRM3:");
+
+   src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+
+   /* t0 = x * x */
+   spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]);
+
+   /* t1 = y * y */
+   spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]);
+
+   /* t0 = z * z + t0 */
+   spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
+
+   /* t1 = 1.0 / sqrt(t0) */
+   spe_frsqest(gen->f, t1_reg, t0_reg);
+   spe_fi(gen->f, t1_reg, t0_reg, t1_reg);
+
+   for (ch = 0; ch < 3; ch++) {  /* NOTE: omit W channel */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* dst = src[ch] * t1 */
+         spe_fm(gen->f, d_reg, src_reg[ch], t1_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+
+/**
+ * Emit cross product.  See emit_ADD for comments.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   spe_comment(gen->f, -4, "XPD:");
+
+   int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = z0 * y1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = y0 * z1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
+      store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = x0 * z1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = z0 * x1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = y0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = x0 * y1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
 
 /**
  * Emit set-if-greater-than.
@@ -323,6 +992,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
+   spe_comment(gen->f, -4, "SGT:");
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -331,26 +1002,776 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* d = (s1 > s2) */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
          /* d = d & one_reg */
          spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
-         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-then.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLT:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 < s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_greater-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SGE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 >= s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-then-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 <= s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SEQ:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 == s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_not_equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SNE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 != s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+         spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit compare.  See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "CMP:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int zero_reg = get_itemp(gen);
+   
+         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+         /* d = (s1 < 0) ? s2 : s3 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit trunc.  
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "TRUNC:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, s1_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit floor.  
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+   
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, tmp_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Compute frac = Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FRC:");
+
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, tmp_reg, tmp_reg, 0);
+
+         /* d = s1 - FLR(s1) */
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+#if 0
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i;
+   for (i = 0; i < funcs->num; i++) {
+      printf("SPU func %u: %s at %u\n",
+             i, funcs->names[i], funcs->addrs[i]);
+   }
+}
 #endif
 
+
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i, addr = 0;
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];
+      }
+   }
+   assert(addr && "spu function not found");
+   return addr / 4;  /* discard 2 least significant bits */
+}
+
+
+/**
+ * Emit code to call a SPU function.
+ * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+                   const struct tgsi_full_instruction *inst,
+                   char *funcname, uint num_args, boolean scalar)
+{
+   const uint addr = lookup_function(gen->cell, funcname);
+   char comment[100];
+   int s_regs[3];
+   int func_called = FALSE;
+   uint a, ch;
+   int retval_reg = -1;
+
+   assert(num_args <= 3);
+
+   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+   spe_comment(gen->f, -4, comment);
+
+   if (scalar) {
+      for (a = 0; a < num_args; a++) {
+         s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+      }
+      /* we'll call the function, put the return value in this register,
+       * then replicate it across all write-enabled components in d_reg.
+       */
+      retval_reg = spe_allocate_available_register(gen->f);
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint i, numUsed;
+
+         if (!scalar) {
+            for (a = 0; a < num_args; a++) {
+               s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+            }
+         }
+
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         if (!scalar || !func_called) {
+            /* for a scalar function, we'll really only call the function once */
+
+            numUsed = spe_get_registers_used(gen->f, usedRegs);
+            assert(numUsed < gen->frame_size / 16 - 2);
+
+            /* save registers to stack */
+            for (i = 0; i < numUsed; i++) {
+               uint reg = usedRegs[i];
+               int offset = 2 + i;
+               spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+
+            /* setup function arguments */
+            for (a = 0; a < num_args; a++) {
+               spe_move(gen->f, 3 + a, s_regs[a]);
+            }
+
+            /* branch to function, save return addr */
+            spe_brasl(gen->f, SPE_REG_RA, addr);
+
+            /* save function's return value */
+            if (scalar)
+               spe_move(gen->f, retval_reg, 3);
+            else
+               spe_move(gen->f, d_reg, 3);
+
+            /* restore registers from stack */
+            for (i = 0; i < numUsed; i++) {
+               uint reg = usedRegs[i];
+               if (reg != d_reg && reg != retval_reg) {
+                  int offset = 2 + i;
+                  spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+               }
+            }
+
+            func_called = TRUE;
+         }
+
+         if (scalar) {
+            spe_move(gen->f, d_reg, retval_reg);
+         }
+
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
    }
 
+   if (scalar) {
+      spe_release_register(gen->f, retval_reg);
+   }
+
    return true;
 }
 
 
+static boolean
+emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint target = inst->InstructionExtTexture.Texture;
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   uint addr;
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   switch (target) {
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_2D:
+      addr = lookup_function(gen->cell, "spu_tex_2d");
+      break;
+   case TGSI_TEXTURE_3D:
+      addr = lookup_function(gen->cell, "spu_tex_3d");
+      break;
+   case TGSI_TEXTURE_CUBE:
+      addr = lookup_function(gen->cell, "spu_tex_cube");
+      break;
+   default:
+      ASSERT(0 && "unsupported texture target");
+      return FALSE;
+   }
+
+   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
+   spe_comment(gen->f, -4, "CALL tex:");
+
+   /* get src/dst reg info */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      assert(numUsed < gen->frame_size / 16 - 2);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments (XXX depends on target) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * KILL if any of src reg values are less than zero.
+ */
+static boolean
+emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;
+
+   spe_comment(gen->f, -4, "CALL kil:");
+
+   /* zero = {0,0,0,0} */
+   zero_reg = get_itemp(gen);
+   spe_load_uint(gen->f, zero_reg, 0);
+
+   cmp_reg = get_itemp(gen);
+
+   /* get src regs */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      }
+   }
+
+   /* test if any src regs are < 0 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (kil_reg >= 0) {
+            /* cmp = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
+            /* kil = kil | cmp */
+            spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
+         }
+         else {
+            kil_reg = get_itemp(gen);
+            /* kil = 0 > src ? : ~0 : 0 */
+            spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
+         }
+      }
+   }
+
+   if (gen->if_nesting) {
+      /* may have been a conditional kil */
+      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
+   }
+
+   /* allocate the kill mask reg if needed */
+   if (gen->kill_mask_reg <= 0) {
+      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
+      spe_move(gen->f, gen->kill_mask_reg, kil_reg);
+   }
+   else {
+      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
+   }
+
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
+
+   spe_comment(gen->f, -4, "MAX:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
+
+   /* d = (s0 > s1) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+/**
+ * Emit max.  See emit_SGT for comments.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
+
+   spe_comment(gen->f, -4, "MIN:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
+
+   /* d = (s1 > s0) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int channel = 0;
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "IF:");
+
+   /* update execution mask with the predicate register */
+   int tmp_reg = get_itemp(gen);
+   int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+   /* tmp = (s1_reg == 0) */
+   spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+   /* tmp = !tmp */
+   spe_complement(gen->f, tmp_reg, tmp_reg);
+   /* exec_mask = exec_mask & tmp */
+   spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+
+   gen->if_nesting++;
+
+   free_itemps(gen);
+
+   return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ELSE:");
+
+   /* exec_mask = !exec_mask */
+   spe_complement(gen->f, exec_reg, exec_reg);
+
+   return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   spe_comment(gen->f, -4, "ENDIF:");
+
+   /* XXX todo: pop execution mask */
+
+   spe_load_int(gen->f, exec_reg, ~0x0);
+
+   gen->if_nesting--;
+   return true;
+}
+
+
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         int t1_reg = get_itemp(gen);
+         int t2_reg = get_itemp(gen);
+
+         spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+         if (ddx) {
+            spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */
+         }
+         else {
+            spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */
+         }
+         spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
+
+
 /**
  * Emit END instruction.
  * We just return from the shader function at this point.
@@ -361,11 +1782,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_END(struct codegen *gen)
 {
-   /* return from function call */
-   spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
-   printf("bi\trRA\n");
-#endif
+   spe_comment(gen->f, -4, "END:");
+   emit_epilogue(gen);
    return true;
 }
 
@@ -379,19 +1797,101 @@ emit_instruction(struct codegen *gen,
 {
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
       return emit_MOV(gen, inst);
    case TGSI_OPCODE_MUL:
       return emit_MUL(gen, inst);
    case TGSI_OPCODE_ADD:
       return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SUB:
+      return emit_SUB(gen, inst);
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(gen, inst);
+   case TGSI_OPCODE_LERP:
+      return emit_LERP(gen, inst);
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(gen, inst);
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(gen, inst);
+   case TGSI_OPCODE_DPH:
+      return emit_DPH(gen, inst);
+   case TGSI_OPCODE_NRM:
+      return emit_NRM3(gen, inst);
+   case TGSI_OPCODE_XPD:
+      return emit_XPD(gen, inst);
+   case TGSI_OPCODE_RCP:
+      return emit_RCP(gen, inst);
+   case TGSI_OPCODE_RSQ:
+      return emit_RSQ(gen, inst);
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
       return emit_SGT(gen, inst);
+   case TGSI_OPCODE_SLT:
+      return emit_SLT(gen, inst);
+   case TGSI_OPCODE_SGE:
+      return emit_SGE(gen, inst);
+   case TGSI_OPCODE_SLE:
+      return emit_SLE(gen, inst);
+   case TGSI_OPCODE_SEQ:
+      return emit_SEQ(gen, inst);
+   case TGSI_OPCODE_SNE:
+      return emit_SNE(gen, inst);
+   case TGSI_OPCODE_CMP:
+      return emit_CMP(gen, inst);
+   case TGSI_OPCODE_MAX:
+      return emit_MAX(gen, inst);
+   case TGSI_OPCODE_MIN:
+      return emit_MIN(gen, inst);
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(gen, inst);
+   case TGSI_OPCODE_FLR:
+      return emit_FLR(gen, inst);
+   case TGSI_OPCODE_FRC:
+      return emit_FRC(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
+   case TGSI_OPCODE_COS:
+      return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
+   case TGSI_OPCODE_SIN:
+      return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
+   case TGSI_OPCODE_POW:
+      return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXB:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXL:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TEX(gen, inst);
+   case TGSI_OPCODE_KIL:
+      return emit_KIL(gen, inst);
+
+   case TGSI_OPCODE_IF:
+      return emit_IF(gen, inst);
+   case TGSI_OPCODE_ELSE:
+      return emit_ELSE(gen, inst);
+   case TGSI_OPCODE_ENDIF:
+      return emit_ENDIF(gen, inst);
+
+   case TGSI_OPCODE_DDX:
+      return emit_DDX_DDY(gen, inst, true);
+   case TGSI_OPCODE_DDY:
+      return emit_DDX_DDY(gen, inst, false);
+
    /* XXX lots more cases to do... */
 
    default:
+      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+              inst->Instruction.Opcode);
       return false;
    }
 
@@ -401,48 +1901,93 @@ emit_instruction(struct codegen *gen,
 
 
 /**
+ * Emit code for a TGSI immediate value (vector of four floats).
+ * This involves register allocation and initialization.
+ * XXX the initialization should be done by a "prepare" stage, not
+ * per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+
+   spe_comment(gen->f, -4, "IMMEDIATE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      float val = immed->u.ImmediateFloat32[ch].Float;
+
+      if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+         /* re-use previous register */
+         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+      else {
+         int reg = spe_allocate_available_register(gen->f);
+
+         if (reg < 0)
+            return false;
+
+         /* update immediate map */
+         gen->imm_regs[gen->num_imm][ch] = reg;
+
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
+   }
+
+   gen->num_imm++;
+
+   return true;
+}
+
+
+
+/**
  * Emit "code" for a TGSI declaration.
  * We only care about TGSI TEMPORARY register declarations at this time.
  * For each TGSI TEMPORARY we allocate four SPE registers.
  */
-static void
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+static boolean
+emit_declaration(struct cell_context *cell,
+                 struct codegen *gen, const struct tgsi_full_declaration *decl)
 {
    int i, ch;
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_TEMPORARY:
-#if DISASSEM
-      printf("Declare temp reg %d .. %d\n",
-             decl->DeclarationRange.First,
-             decl->DeclarationRange.Last);
-#endif
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
+         assert(i < MAX_TEMPS);
          for (ch = 0; ch < 4; ch++) {
             gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+            if (gen->temp_regs[i][ch] < 0)
+               return false; /* out of regs */
          }
 
          /* XXX if we run out of SPE registers, we need to spill
           * to SPU memory.  someday...
           */
 
-#if DISASSEM
-         printf("  SPE regs: %d %d %d %d\n",
-                gen->temp_regs[i][0],
-                gen->temp_regs[i][1],
-                gen->temp_regs[i][2],
-                gen->temp_regs[i][3]);
-#endif
+         {
+            char buf[100];
+            sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+                    gen->temp_regs[i][0], gen->temp_regs[i][1],
+                    gen->temp_regs[i][2], gen->temp_regs[i][3]);
+            spe_comment(gen->f, -4, buf);
+         }
       }
       break;
    default:
       ; /* ignore */
    }
+
+   return true;
 }
 
 
+
 /**
  * Translate TGSI shader code to SPE instructions.  This is done when
  * the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -460,6 +2005,7 @@ cell_gen_fragment_program(struct cell_context *cell,
    struct codegen gen;
 
    memset(&gen, 0, sizeof(gen));
+   gen.cell = cell;
    gen.f = f;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -472,50 +2018,50 @@ cell_gen_fragment_program(struct cell_context *cell,
    spe_allocate_register(f, gen.outputs_reg);
    spe_allocate_register(f, gen.constants_reg);
 
-#if DISASSEM
-   printf("Begin %s\n", __FUNCTION__);
-   tgsi_dump(tokens, 0);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      printf("Begin %s\n", __FUNCTION__);
+      tgsi_dump(tokens, 0);
+   }
 
    tgsi_parse_init(&parse, tokens);
 
+   emit_prologue(&gen);
+
    while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
       tgsi_parse_token(&parse);
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
-         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
-            goto fail;
-#endif
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_DECLARATION:
-         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
             gen.error = true;
-         }
          break;
 
       default:
          assert(0);
-
       }
    }
 
-
    if (gen.error) {
       /* terminate the SPE code */
       return emit_END(&gen);
    }
 
-#if DISASSEM
-   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
-   printf("End %s\n", __FUNCTION__);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+      printf("End %s\n", __FUNCTION__);
+   }
 
    tgsi_parse_free( &parse );
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e980..2c64eb1bccd 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,12 +54,17 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
+   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+    * quantities.  This only makes a difference for 32-bit Z values though.
+    */
    ASSERT(dsa->depth.enabled);
 
    switch (dsa->depth.func) {
@@ -79,28 +84,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
 
    case PIPE_FUNC_GREATER:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LESS:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LEQUAL:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_GEQUAL:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
@@ -129,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -229,7 +237,40 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
+ */
+static inline void
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+   *is_already_set = true;
+}
+
+static inline void
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
+}
+
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
 
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
 
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +283,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  */
 static void
 gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
           struct spe_function *f,
           enum pipe_format color_format,
           int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +304,17 @@ gen_blend(const struct pipe_blend_state *blend,
    int fbB_reg = spe_allocate_available_register(f);
    int fbA_reg = spe_allocate_available_register(f);
 
-   int one_reg = spe_allocate_available_register(f);
    int tmp_reg = spe_allocate_available_register(f);
 
-   boolean one_reg_set = false; /* avoid setting one_reg more than once */
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
+    */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
 
    ASSERT(blend->blend_enable);
 
@@ -346,127 +395,449 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_release_register(f, mask_reg);
    }
 
-
    /*
-    * Compute Src RGB terms
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
       spe_move(f, term1R_reg, fragR_reg);
       spe_move(f, term1G_reg, fragG_reg);
       spe_move(f, term1B_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term1R_reg);
-      spe_zero(f, term1G_reg);
-      spe_zero(f, term1B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
       spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
       spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,b-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Src Alpha term
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
       spe_move(f, term1A_reg, fragA_reg);
       break;
+
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest RGB terms
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
       spe_move(f, term2R_reg, fbR_reg);
       spe_move(f, term2G_reg, fbG_reg);
       spe_move(f, term2B_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2R_reg);
-      spe_zero(f, term2G_reg);
-      spe_zero(f, term2B_reg);
+      /* factor s= (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
       spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
       spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = fb * tmp */
-      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
-      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
-      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
-      break;
-      /* XXX more cases */
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest Alpha term
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
       spe_move(f, term2A_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2A_reg);
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
       break;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* termA = fbA * tmp */
-      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
       break;
-      /* XXX more cases */
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Combine Src/Dest RGB terms
+    * Combine Src/Dest RGB terms as per the blend equation.
     */
    switch (blend->rgb_func) {
    case PIPE_BLEND_ADD:
@@ -479,7 +850,21 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
       spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
    default:
       ASSERT(0);
    }
@@ -494,7 +879,15 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLEND_SUBTRACT:
       spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_MAX:
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
    default:
       ASSERT(0);
    }
@@ -514,8 +907,14 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, fbB_reg);
    spe_release_register(f, fbA_reg);
 
-   spe_release_register(f, one_reg);
    spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
 }
 
 
@@ -524,24 +923,74 @@ gen_logicop(const struct pipe_blend_state *blend,
             struct spe_function *f,
             int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
-}
-
-
-static void
-gen_colormask(uint colormask,
-              struct spe_function *f,
-              int fragRGBA_reg, int fbRGBA_reg)
-{
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
 }
 
 
-
 /**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
  *
  * \param f             SPE function to append instruction onto.
  * \param color_format  the dest color packing format
@@ -557,13 +1006,16 @@ gen_pack_colors(struct spe_function *f,
                 int r_reg, int g_reg, int b_reg, int a_reg,
                 int rgba_reg)
 {
+   int rg_reg = spe_allocate_available_register(f);
+   int ba_reg = spe_allocate_available_register(f);
+
    /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
    spe_cfltu(f, r_reg, r_reg, 32);
    spe_cfltu(f, g_reg, g_reg, 32);
    spe_cfltu(f, b_reg, b_reg, 32);
    spe_cfltu(f, a_reg, a_reg, 32);
 
-   /* Shift the most significant bytes to least the significant positions.
+   /* Shift the most significant bytes to the least significant positions.
     * I.e.: reg = reg >> 24
     */
    spe_rotmi(f, r_reg, r_reg, -24);
@@ -595,12 +1047,713 @@ gen_pack_colors(struct spe_function *f,
     * OR-ing all those together gives us four packed colors:
     *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
     */
-   spe_or(f, rgba_reg, r_reg, g_reg);
-   spe_or(f, rgba_reg, rgba_reg, b_reg);
-   spe_or(f, rgba_reg, rgba_reg, a_reg);
+   spe_or(f, rg_reg, r_reg, g_reg);
+   spe_or(f, ba_reg, a_reg, b_reg);
+   spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+   spe_release_register(f, rg_reg);
+   spe_release_register(f, ba_reg);
+}
+
+static void
+gen_colormask(struct spe_function *f,
+              uint colormask,
+              enum pipe_format color_format,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.  Further, the pixels
+    * are packed according to the given color format, not
+    * necessarily RGBA...
+    */
+   unsigned int r_mask;
+   unsigned int g_mask;
+   unsigned int b_mask;
+   unsigned int a_mask;
+
+   /* Calculate exactly where the bits for any particular color
+    * end up, so we can mask them correctly.
+    */
+   switch(color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* ARGB */
+         a_mask = 0xff000000;
+         r_mask = 0x00ff0000;
+         g_mask = 0x0000ff00;
+         b_mask = 0x000000ff;
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* BGRA */
+         b_mask = 0xff000000;
+         g_mask = 0x00ff0000;
+         r_mask = 0x0000ff00;
+         a_mask = 0x000000ff;
+         break;
+      default:
+         ASSERT(0);
+   }
+
+   /* For each R, G, B, and A component we're supposed to mask out, 
+    * clear its bits.   Then our mask operation later will work 
+    * as expected.
+    */
+   if (!(colormask & PIPE_MASK_R)) {
+      r_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_G)) {
+      g_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_B)) {
+      b_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_A)) {
+      a_mask = 0;
+   }
+
+   /* Get a temporary register to hold the mask that will be applied to the fragment */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
+    * masks.  Load the result value into our temporary register.
+    */
+   spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
 }
 
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in 
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * There's some added complexity if there's a non-trivial state->mask
+ * value; then stencil and reference both must be masked
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
+                 unsigned int stencil_max_value,
+                 unsigned int fragment_mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
 
+   case PIPE_FUNC_NOTEQUAL:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & ~(s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_LESS:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference < s)  */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_GREATER:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference > s) */
+         /* There's no convenient Compare Less Than Immediate instruction, so
+          * we'll have to do this one the harder way, by loading a register and 
+          * comparing directly.  Compare Logical Greater Than Word (clgt) 
+          * treats its operands as unsigned - no sign extension.
+          */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference >= s) 
+          *              = fragment_mask & ~(s > reference) */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference <= s) ]
+          *               = fragment_mask & ~(reference > s) */
+         /* As above, we have to do this by loading a register */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
+      break;
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = fragment_mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = fragment_mask & 1 = fragment_mask */
+      spe_move(f, stencil_pass_reg, fragment_mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes the the stencil_max_value is of the form 
+    * 2^n-1 and can therefore be used as a mask for the valid bits in 
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a 
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* Select from the current value or the new value based on the equality test */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits 
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+                       const unsigned int depth_enabled,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg)
+{
+   unsigned zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(stencil->enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op
+    *
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
+    */
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   /* Check the possibly overridden value, not the structure value */
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == stencil->fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (stencil->zpass_op == stencil->fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (stencil->zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+}
+
+/* Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled.  This function must not use
+ * the register if depth is not enabled.
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f, 
+                       const struct pipe_depth_stencil_alpha_state *dsa, 
+                       const uint facing,
+                       const int mask_reg, const int fragZ_reg, 
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   struct pipe_stencil_state *stencil;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil 
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely 
+    * be identical, so there's good reason to try to avoid calculating 
+    * the same values more than once (which unfortunately makes the code less 
+    * straightforward).
+    *
+    * To make register management easier, we start a new 
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as 
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_comment(f, 0, "Allocating stencil register set");
+   spe_allocate_register_set(f);
+
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+   }
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
+    */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
+       need_to_calculate_stencil_values = false;
+       need_to_writemask_stencil_values = false;
+    }
+    else if (stencil->write_mask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (stencil->write_mask == 0xff) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      spe_comment(f, 0, "Computing stencil writemask");
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask);
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   spe_comment(f, 0, "Running basic stencil test");
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the 
+    * mask of valid fragments based on the results of the depth test).
+    */
+   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
+
+   /* We now have all the stencil values we need.  We also need 
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      spe_comment(f, 0, "Running stencil depth test");
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Saving current stencil values for writemasking");
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_comment(f, 0, "Loading stencil fail values");
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Loading stencil pass/depth fail values");
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth pass mask.  Note that we
+       * *can* get here with zmask_reg being unset (if the depth
+       * test is off but the stencil test is on).  In this case,
+       * we assume the depth test passes, and don't need to mask
+       * the stencil pass mask with the Z mask.
+       */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            /* We'll need a separate register */
+            spe_comment(f, 0, "Loading stencil pass/depth pass values");
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* We can use the same stencil-pass register */
+            spe_comment(f, 0, "Loading stencil pass values");
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = true;
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values && modified_buffers) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_comment(f, 0, "Writemasking new stencil values");
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_comment(f, 0, "Releasing stencil register set");
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
 
 
 /**
@@ -621,14 +1774,19 @@ gen_pack_colors(struct spe_function *f,
  * should be much faster.
  *
  * \param cell  the rendering context (in)
- * \param f     the generated function (out)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
 {
-   const struct pipe_depth_stencil_alpha_state *dsa =
-      &cell->depth_stencil->base;
-   const struct pipe_blend_state *blend = &cell->blend->base;
+   const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+   const struct pipe_blend_state *blend = cell->blend;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
    const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -643,6 +1801,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
 
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
+
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
     */
@@ -651,7 +1811,12 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      spe_comment(f, -4, facing == CELL_FACING_FRONT ? "Begin front-facing per-fragment ops": "Begin back-facing per-fragment ops");
+   }
+
    spe_allocate_register(f, x_reg);
    spe_allocate_register(f, y_reg);
    spe_allocate_register(f, color_tile_reg);
@@ -674,8 +1839,9 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
-      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_comment(f, 0, "Compute quad offset within tile");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
       spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
       spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
@@ -684,139 +1850,196 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
+   /* Generate the alpha test, if needed. */
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Fetching Z/stencil quad from tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else {
-            /* XXX handle other z/stencil formats */
-            ASSERT(0);
-         }
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       * Note that even if depth or stencil is *not* enabled, if it's
+       * present in the buffer, we pull it out and put it back later;
+       * otherwise, we can inadvertently destroy the contents of
+       * buffers we're not supposed to touch (e.g., if the user is
+       * clearing the depth buffer but not the stencil buffer, a
+       * quad of constant depth is drawn over the surface; the stencil
+       * buffer must be maintained).
+       */
+      switch(zs_format) {
 
-         /* Convert fragZ values from float[4] to uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* 24-bit Z values */
-            int scale_reg = spe_allocate_available_register(f);
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            /* Pull out both Z and stencil */
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
 
-            /* scale_reg[0,1,2,3] = float(2^24-1) */
-            spe_load_float(f, scale_reg, (float) 0xffffff);
+            /* four 24-bit Z values in the low-order bits */
+            spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
 
-            /* XXX these two instructions might be combined */
-            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 24-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
 
-            spe_release_register(f, scale_reg);
-         }
-         else {
-            /* XXX handle 16-bit Z format */
-            ASSERT(0);
-         }
-      }
+            /* four 8-bit stencil values in the high-order bits */
+            spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+         break;
 
-      if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+            /* shift by 8 to get the upper 24-bit values */
+            spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 24-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+            /* 8-bit stencil in the low-order bits - mask them out */
+            spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+         break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            /* Copy over 4 32-bit values */
+            spe_move(f, fbZ_reg, fbZS_reg);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 32-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            /* No stencil, so can't do anything there */
+         break;
 
+         case PIPE_FORMAT_Z16_UNORM:
+            /* XXX Not sure this is correct, but it was here before, so we're
+             * going with it for now
+             */
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            /* Copy over 4 32-bit values */
+            spe_move(f, fbZ_reg, fbZS_reg);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 16-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            /* No stencil */
+
+         default:
+            ASSERT(0); /* invalid format */
+      }
 
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
+
+         /* Note that fbZ_reg may not be set on entry, if stenciling
+          * is enabled but there's no Z-buffer.  The 
+          * gen_stencil_depth_test() function must ignore the
+          * fbZ_reg register if depth is not enabled.
+          */
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
             spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
             spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
          }
-         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            ASSERT(0);   /* XXX to do */
          }
          else {
-            /* bad zs_format */
-            ASSERT(0);
+            ASSERT(0); /* bad zs_format */
          }
 
          /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      /* Don't need these any more */
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
     * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
     * we could skip this load.
     */
+   spe_comment(f, 0, "Fetch quad colors from tile");
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
-      gen_blend(blend, f, color_format,
+      spe_comment(f, 0, "Perform blending");
+      gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
@@ -829,19 +2052,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
-      if (blend->colormask != 0xf) {
-         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+      if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
+         gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -853,6 +2078,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store quad colors into color tile");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
@@ -862,9 +2088,14 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
    spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
 
-
    spe_release_register(f, fbRGBA_reg);
    spe_release_register(f, fbZS_reg);
    spe_release_register(f, quad_offset_reg);
-}
 
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      char buffer[1024];
+      sprintf(buffer, "End %s-facing per-fragment ops: %d instructions", 
+         facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst);
+      spe_comment(f, -4, buffer);
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dce..21b35d1fafe 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce6..81efd137c73 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -35,9 +35,9 @@
 #include "draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_flush.h"
+#include "cell_pipe_state.h"
 #include "cell_state.h"
 #include "cell_texture.h"
-#include "cell_state_per_fragment.h"
 
 
 
@@ -45,24 +45,18 @@ static void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
-
-   (void) memcpy(cb, blend, sizeof(*blend));
-#if 0
-   cell_generate_alpha_blend(cb);
-#endif
-   return cb;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
 static void
-cell_bind_blend_state(struct pipe_context *pipe, void *state)
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
    draw_flush(cell->draw);
 
-   cell->blend = (struct cell_blend_state *) state;
+   cell->blend = (struct pipe_blend_state *) blend;
    cell->dirty |= CELL_NEW_BLEND;
 }
 
@@ -70,10 +64,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
 static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
-   struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-
-   spe_release_func(& cb->code);
-   FREE(cb);
+   FREE(blend);
 }
 
 
@@ -95,41 +86,29 @@ cell_set_blend_color(struct pipe_context *pipe,
 
 static void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
-                 const struct pipe_depth_stencil_alpha_state *depth_stencil)
+                 const struct pipe_depth_stencil_alpha_state *dsa)
 {
-   struct cell_depth_stencil_alpha_state *cdsa =
-       MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
-
-   (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
-#if 0
-   cell_generate_depth_stencil_test(cdsa);
-#endif
-   return cdsa;
+   return mem_dup(dsa, sizeof(*dsa));
 }
 
 
 static void
 cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
-                                    void *depth_stencil)
+                                    void *dsa)
 {
    struct cell_context *cell = cell_context(pipe);
 
    draw_flush(cell->draw);
 
-   cell->depth_stencil =
-       (struct cell_depth_stencil_alpha_state *) depth_stencil;
+   cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
    cell->dirty |= CELL_NEW_DEPTH_STENCIL;
 }
 
 
 static void
-cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
 {
-   struct cell_depth_stencil_alpha_state *cdsa =
-       (struct cell_depth_stencil_alpha_state *) depth;
-
-   spe_release_func(& cdsa->code);
-   FREE(cdsa);
+   FREE(dsa);
 }
 
 
@@ -191,24 +170,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe,
 
 static void *
 cell_create_rasterizer_state(struct pipe_context *pipe,
-                             const struct pipe_rasterizer_state *setup)
+                             const struct pipe_rasterizer_state *rasterizer)
 {
-   struct pipe_rasterizer_state *state
-      = MALLOC(sizeof(struct pipe_rasterizer_state));
-   memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
-   return state;
+   return mem_dup(rasterizer, sizeof(*rasterizer));
 }
 
 
 static void
-cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup)
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
 {
+   struct pipe_rasterizer_state *rasterizer =
+      (struct pipe_rasterizer_state *) rast;
    struct cell_context *cell = cell_context(pipe);
 
    /* pass-through to draw module */
-   draw_set_rasterizer_state(cell->draw, setup);
+   draw_set_rasterizer_state(cell->draw, rasterizer);
 
-   cell->rasterizer = (struct pipe_rasterizer_state *)setup;
+   cell->rasterizer = rasterizer;
 
    cell->dirty |= CELL_NEW_RASTERIZER;
 }
@@ -235,17 +213,24 @@ cell_bind_sampler_states(struct pipe_context *pipe,
                          unsigned num, void **samplers)
 {
    struct cell_context *cell = cell_context(pipe);
+   uint i, changed = 0x0;
 
    assert(num <= CELL_MAX_SAMPLERS);
 
    draw_flush(cell->draw);
 
-   memcpy(cell->sampler, samplers, num * sizeof(void *));
-   memset(&cell->sampler[num], 0, (CELL_MAX_SAMPLERS - num) *
-          sizeof(void *));
-   cell->num_samplers = num;
+   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
+      struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL;
+      if (cell->sampler[i] != new_samp) {
+         cell->sampler[i] = new_samp;
+         changed |= (1 << i);
+      }
+   }
 
-   cell->dirty |= CELL_NEW_SAMPLER;
+   if (changed) {
+      cell->dirty |= CELL_NEW_SAMPLER;
+      cell->dirty_samplers |= changed;
+   }
 }
 
 
@@ -263,27 +248,25 @@ cell_set_sampler_textures(struct pipe_context *pipe,
                           unsigned num, struct pipe_texture **texture)
 {
    struct cell_context *cell = cell_context(pipe);
-   uint i;
+   uint i, changed = 0x0;
 
    assert(num <= CELL_MAX_SAMPLERS);
 
-   /* Check for no-op */
-   if (num == cell->num_textures &&
-       !memcmp(cell->texture, texture, num * sizeof(struct pipe_texture *)))
-      return;
-
-   draw_flush(cell->draw);
-
    for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      struct pipe_texture *tex = i < num ? texture[i] : NULL;
-
-      pipe_texture_reference((struct pipe_texture **) &cell->texture[i], tex);
+      struct pipe_texture *new_tex = i < num ? texture[i] : NULL;
+      if ((struct pipe_texture *) cell->texture[i] != new_tex) {
+         pipe_texture_reference((struct pipe_texture **) &cell->texture[i],
+                                new_tex);
+         changed |= (1 << i);
+      }
    }
-   cell->num_textures = num;
 
-   cell_update_texture_mapping(cell);
+   cell->num_textures = num;
 
-   cell->dirty |= CELL_NEW_TEXTURE;
+   if (changed) {
+      cell->dirty |= CELL_NEW_TEXTURE;
+      cell->dirty_textures |= changed;
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e5..79cb8df82fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b62..d2235579507 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return CELL_MAX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
-      return 0;
+      return 1;
    case PIPE_CAP_TWO_SIDED_STENCIL:
-      return 0;
+      return 1;
    case PIPE_CAP_GLSL:
       return 1;
    case PIPE_CAP_S3TC:
@@ -68,21 +68,21 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
-      return 0;
+      return 1;
    case PIPE_CAP_MAX_RENDER_TARGETS:
       return 1;
    case PIPE_CAP_OCCLUSION_QUERY:
-      return 0;
+      return 1;
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
-      return 0;
+      return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
-      return 0;
+      return 10;
    }
 }
 
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
       return 16.0; /* arbitrary */
 
    default:
-      return 0;
+      return 10;
    }
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 9508227e298..28e5e6d706d 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -36,6 +36,7 @@
 #include "cell_spu.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
+#include "util/u_memory.h"
 #include "cell/common.h"
 
 
@@ -52,6 +53,35 @@ struct cell_global_info cell_global;
 
 
 /**
+ * Scan /proc/cpuinfo to determine the timebase for the system.
+ * This is used by the SPUs to convert 'decrementer' ticks to seconds.
+ * There may be a better way to get this value...
+ */
+static unsigned
+get_timebase(void)
+{
+   FILE *f = fopen("/proc/cpuinfo", "r");
+   unsigned timebase;
+
+   assert(f);
+   while (!feof(f)) {
+      char line[80];
+      fgets(line, sizeof(line), f);
+      if (strncmp(line, "timebase", 8) == 0) {
+         char *colon = strchr(line, ':');
+         if (colon) {
+            timebase = atoi(colon + 2);
+            break;
+         }
+      }
+   }
+   fclose(f);
+
+   return timebase;
+}
+
+
+/**
  * Write a 1-word message to the given SPE mailbox.
  */
 void
@@ -114,6 +144,7 @@ cell_start_spus(struct cell_context *cell)
 {
    static boolean one_time_init = FALSE;
    uint i, j;
+   uint timebase = get_timebase();
 
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
@@ -123,24 +154,29 @@ cell_start_spus(struct cell_context *cell)
 
    one_time_init = TRUE;
 
-   assert(cell->num_spus <= MAX_SPUS);
-
-   ASSERT_ALIGN16(&cell_global.command[0]);
-   ASSERT_ALIGN16(&cell_global.command[1]);
+   assert(cell->num_spus <= CELL_MAX_SPUS);
 
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
+    * The SPUs will then each grab their init info with mfc_get().
+    */
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].debug_flags = cell->debug_flags;
-      cell_global.inits[i].cmd = &cell_global.command[i];
+      cell_global.inits[i].inv_timebase = 1000.0f / timebase;
+
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
+      cell_global.inits[i].spu_functions = &cell->spu_functions;
+
       cell_global.spe_contexts[i] = spe_context_create(0, NULL);
       if (!cell_global.spe_contexts[i]) {
          fprintf(stderr, "spe_context_create() failed\n");
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index 137f26612e4..c93958a9ed5 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,14 +30,12 @@
 
 
 #include <libspe2.h>
-#include <libmisc.h>
+#include <pthread.h>
 #include "cell/common.h"
 
 #include "cell_context.h"
 
 
-#define MAX_SPUS 8
-
 /**
  * Global vars, for now anyway.
  */
@@ -46,14 +44,13 @@ struct cell_global_info
    /**
     * SPU/SPE handles, etc
     */
-   spe_context_ptr_t spe_contexts[MAX_SPUS];
-   pthread_t spe_threads[MAX_SPUS];
+   spe_context_ptr_t spe_contexts[CELL_MAX_SPUS];
+   pthread_t spe_threads[CELL_MAX_SPUS];
 
    /**
-    * Data sent to SPUs
+    * Data sent to SPUs at start-up
     */
-   struct cell_init_info inits[MAX_SPUS];
-   struct cell_command command[MAX_SPUS];
+   struct cell_init_info inits[CELL_MAX_SPUS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a31..b193170f9ce 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983c..0a0af81f53f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,18 +25,155 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
-#include "cell_state_per_fragment.h"
 #include "cell_batch.h"
 #include "cell_texture.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 
 
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key
+    */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.blend_color = cell->blend_color;
+   key.dsa = *cell->depth_stencil;
+
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
+
+      /* Generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
+
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
+
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
 static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
@@ -73,6 +210,13 @@ cell_emit_state(struct cell_context *cell)
 #endif
    }
 
+   if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      struct cell_command_rasterizer *rast =
+         cell_batch_alloc(cell, sizeof(*rast));
+      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+      rast->rasterizer = *cell->rasterizer;
+   }
+
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
       struct cell_command_fragment_program *fp
@@ -90,59 +234,81 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
-      /* XXX we don't want to always do codegen here.  We should have
-       * a hash/lookup table to cache previous results...
-       */
-      struct cell_command_fragment_ops *fops
-            = cell_batch_alloc(cell, sizeof(*fops));
-      struct spe_function spe_code;
-
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
-      /* put the new code into the batch buffer */
-      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(&fops->code, spe_code.store,
-             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      fops->dsa = cell->depth_stencil->base;
-      fops->blend = cell->blend->base;
-      /* free codegen buffer */
-      spe_release_func(&spe_code);
+      struct cell_command_fragment_ops *fops, *fops_cmd;
+      /* Note that cell_command_fragment_ops is a variant-sized record */
+      fops = lookup_fragment_ops(cell);
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
       uint i;
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-         if (cell->sampler[i]) {
-            struct cell_command_sampler *sampler
-               = cell_batch_alloc(cell, sizeof(*sampler));
-            sampler->opcode = CELL_CMD_STATE_SAMPLER;
-            sampler->unit = i;
-            sampler->state = *cell->sampler[i];
+         if (cell->dirty_samplers & (1 << i)) {
+            if (cell->sampler[i]) {
+               struct cell_command_sampler *sampler
+                  = cell_batch_alloc(cell, sizeof(*sampler));
+               sampler->opcode = CELL_CMD_STATE_SAMPLER;
+               sampler->unit = i;
+               sampler->state = *cell->sampler[i];
+            }
          }
       }
+      cell->dirty_samplers = 0x0;
    }
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       uint i;
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
-         struct cell_command_texture *texture
-            =  cell_batch_alloc(cell, sizeof(*texture));
-         texture->opcode = CELL_CMD_STATE_TEXTURE;
-         texture->unit = i;
-         if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
-         }
-         else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+         if (cell->dirty_textures & (1 << i)) {
+            struct cell_command_texture *texture
+               =  cell_batch_alloc(cell, sizeof(*texture));
+            texture->opcode = CELL_CMD_STATE_TEXTURE;
+            texture->unit = i;
+            if (cell->texture[i]) {
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = cell->texture[i]->tiled_mapped[level];
+                  texture->width[level] = cell->texture[i]->base.width[level];
+                  texture->height[level] = cell->texture[i]->base.height[level];
+                  texture->depth[level] = cell->texture[i]->base.depth[level];
+               }
+               texture->target = cell->texture[i]->base.target;
+            }
+            else {
+               uint level;
+               for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+                  texture->start[level] = NULL;
+                  texture->width[level] = 0;
+                  texture->height[level] = 0;
+                  texture->depth[level] = 0;
+               }
+               texture->target = 0;
+            }
          }
       }
+      cell->dirty_textures = 0x0;
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2a..cda39f8d592 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -191,13 +191,18 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   draw_flush(cell->draw);
+
    /* note: reference counting */
    winsys_buffer_reference(ws,
                         &cell->constants[shader].buffer,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index 732c64082ef..c9203fee087 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -27,6 +27,7 @@
 
 #include "util/u_rect.h"
 #include "cell_context.h"
+#include "cell_surface.h"
 
 
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86e..9f83ab8fa4c 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
   * Authors:
   *   Keith Whitwell <[email protected]>
   *   Michel Dänzer <[email protected]>
+  *   Brian Paul
   */
 
 #include "pipe/p_context.h"
@@ -42,30 +43,31 @@
 #include "cell_texture.h"
 
 
-/* Simple, maximally packed layout.
- */
 
-static unsigned minify( unsigned d )
+static unsigned
+minify(unsigned d)
 {
    return MAX2(1, d>>1);
 }
 
 
 static void
-cell_texture_layout(struct cell_texture * spt)
+cell_texture_layout(struct cell_texture *ct)
 {
-   struct pipe_texture *pt = &spt->base;
+   struct pipe_texture *pt = &ct->base;
    unsigned level;
    unsigned width = pt->width[0];
    unsigned height = pt->height[0];
    unsigned depth = pt->depth[0];
 
-   spt->buffer_size = 0;
+   ct->buffer_size = 0;
 
    for ( level = 0 ; level <= pt->last_level ; level++ ) {
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -76,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt)
       pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
       pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
 
-      spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+      ct->stride[level] = pt->nblocksx[level] * pt->block.size;
 
-      spt->level_offset[level] = spt->buffer_size;
+      ct->level_offset[level] = ct->buffer_size;
 
       size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
       if (pt->target == PIPE_TEXTURE_CUBE)
@@ -86,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt)
       else
          size *= depth;
 
-      spt->buffer_size += size;
+      ct->buffer_size += size;
 
       width  = minify(width);
       height = minify(height);
@@ -100,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen,
                     const struct pipe_texture *templat)
 {
    struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
-   if (!spt)
+   struct cell_texture *ct = CALLOC_STRUCT(cell_texture);
+   if (!ct)
       return NULL;
 
-   spt->base = *templat;
-   spt->base.refcount = 1;
-   spt->base.screen = screen;
+   ct->base = *templat;
+   ct->base.refcount = 1;
+   ct->base.screen = screen;
 
-   cell_texture_layout(spt);
+   cell_texture_layout(ct);
 
-   spt->buffer = ws->buffer_create(ws, 32,
-                                   PIPE_BUFFER_USAGE_PIXEL,
-                                   spt->buffer_size);
+   ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL,
+                                  ct->buffer_size);
 
-   if (!spt->buffer) {
-      FREE(spt);
+   if (!ct->buffer) {
+      FREE(ct);
       return NULL;
    }
 
-   return &spt->base;
+   return &ct->base;
 }
 
 
@@ -135,177 +136,310 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
-      struct cell_texture *spt = cell_texture(*pt);
+      /* Delete this texture now.
+       * But note that the underlying pipe_buffer may linger...
+       */
+      struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
-      DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+      DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
       */
 
-      pipe_buffer_reference(screen, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &ct->buffer, NULL);
 
-      FREE(spt);
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         /* Unreference the tiled image buffer.
+          * It may not actually be deleted until a fence is hit.
+          */
+         if (ct->tiled_buffer[i]) {
+            ct->tiled_mapped[i] = NULL;
+            winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL);
+         }
+      }
+
+      FREE(ct);
    }
    *pt = NULL;
 }
 
 
-#if 0
+
+/**
+ * Convert image from linear layout to tiled layout.  4-byte pixels.
+ */
 static void
-cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
-                    uint face, uint levelsMask)
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                   uint src_stride, const uint *src)
 {
-   /* XXX TO DO:  re-tile the texture data ... */
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
 
-}
-#endif
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
 
+   src_stride /= 4; /* convert from bytes to pixels */
 
-static struct pipe_surface *
-cell_get_tex_surface(struct pipe_screen *screen,
-                     struct pipe_texture *pt,
-                     unsigned face, unsigned level, unsigned zslice,
-                     unsigned usage)
-{
-   struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = cell_texture(pt);
-   struct pipe_surface *ps;
+   /* loop over dest tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of dest tile: */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
 
-   ps = ws->surface_alloc(ws);
-   if (ps) {
-      assert(ps->refcount);
-      assert(ps->winsys);
-      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
-      ps->format = pt->format;
-      ps->block = pt->block;
-      ps->width = pt->width[level];
-      ps->height = pt->height[level];
-      ps->nblocksx = pt->nblocksx[level];
-      ps->nblocksy = pt->nblocksy[level];
-      ps->stride = spt->stride[level];
-      ps->offset = spt->level_offset[level];
-      ps->usage = usage;
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
 
-      /* XXX may need to override usage flags (see sp_texture.c) */
+         /* loop over texels in the tile */
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               ASSERT(srci < h);
+               ASSERT(srcj < w);
+               tdst[i * tile_size + j] = src[srci * src_stride + srcj];
+            }
+         }
+      }
+   }
+}
 
-      pipe_texture_reference(&ps->texture, pt); 
-      ps->face = face;
-      ps->level = level;
-      ps->zslice = zslice;
 
-      if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
-	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
-		       ps->nblocksy *
-		       ps->stride;
-      }
-      else {
-	 assert(face == 0);
-	 assert(zslice == 0);
+/**
+ * For Cell.  Basically, rearrange the pixels/quads from this layout:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ *
+ * to this layout:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+   int y, x;
+
+   for (y = 0; y < TILE_SIZE; y+=2) {
+      for (x = 0; x < TILE_SIZE; x+=2) {
+         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+         tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k];
+         tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1];
+         tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2];
+         tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3];
       }
    }
-   return ps;
 }
 
 
-
 /**
- * Copy tile data from linear layout to tiled layout.
- * XXX this should be rolled into the future surface-creation code.
- * XXX also need "untile" code...
+ * Convert image from tiled layout to linear layout.  4-byte pixels.
  */
 static void
-tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                     uint dst_stride, const uint *src)
 {
    const uint tile_size2 = tile_size * tile_size;
-   const uint h_t = h / tile_size, w_t = w / tile_size;
-
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+   uint *tile_buf;
    uint it, jt;  /* tile counters */
    uint i, j;    /* intra-tile counters */
 
-   /* loop over dest tiles */
+   dst_stride /= 4; /* convert from bytes to pixels */
+
+   tile_buf = align_malloc(tile_size * tile_size * 4, 16);
+   
+   /* loop over src tiles */
    for (it = 0; it < h_t; it++) {
       for (jt = 0; jt < w_t; jt++) {
-         /* start of dest tile: */
-         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         /* start of src tile: */
+         const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+         
+         twiddle_tile(tsrc, tile_buf);
+         tsrc = tile_buf;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
          /* loop over texels in the tile */
-         for (i = 0; i < tile_size; i++) {
-            for (j = 0; j < tile_size; j++) {
-               const uint srci = it * tile_size + i;
-               const uint srcj = jt * tile_size + j;
-               *tdst++ = src[srci * w + srcj];
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               uint dsti = it * tile_size + i;
+               uint dstj = jt * tile_size + j;
+               ASSERT(dsti < h);
+               ASSERT(dstj < w);
+               dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
             }
          }
       }
    }
-}
 
+   align_free(tile_buf);
+}
 
 
 /**
  * Convert linear texture image data to tiled format for SPU usage.
- * XXX recast this in terms of pipe_surfaces (aka texture views).
  */
 static void
-cell_tile_texture(struct cell_context *cell,
-                  struct cell_texture *texture)
+cell_twiddle_texture(struct pipe_screen *screen,
+                     struct pipe_surface *surface)
 {
-   struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
-   const uint *src;
-
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
-
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
-
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
-
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
+   const uint bufWidth = align(texWidth, TILE_SIZE);
+   const uint bufHeight = align(texHeight, TILE_SIZE);
+   const void *map = pipe_buffer_map(screen, surface->buffer,
+                                     PIPE_BUFFER_USAGE_CPU_READ);
+   const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+   switch (ct->base.format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_buffer[level]) {
+            /* allocate buffer for tiled data now */
+            struct pipe_winsys *ws = screen->winsys;
+            uint bytes = bufWidth * bufHeight * 4 * numFaces;
+            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
+                                                        PIPE_BUFFER_USAGE_PIXEL,
+                                                        bytes);
+            /* and map it */
+            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
+                                                     PIPE_BUFFER_USAGE_GPU_READ);
+         }
+         dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
+      }
+      break;
+   default:
+      printf("Cell: twiddle unsupported texture format %s\n", pf_name(ct->base.format));
+      ;
    }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
 
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+   pipe_buffer_unmap(screen, surface->buffer);
+}
+
+
+/**
+ * Convert SPU tiled texture image data to linear format for app usage.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+                     struct pipe_surface *surface)
+{
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
+   const void *map = pipe_buffer_map(screen, surface->buffer,
+                                     PIPE_BUFFER_USAGE_CPU_READ);
+   const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+   switch (ct->base.format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = surface->stride * texHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->untiled_data[level]) {
+            ct->untiled_data[level] =
+               align_malloc(surface->stride * texHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
 
-   pipe_surface_unmap(surf);
+         untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                              surface->stride, src);
+      }
+      break;
+   default:
+      {
+         ct->untiled_data[level] = NULL;
+         printf("Cell: untwiddle unsupported texture format %s\n", pf_name(ct->base.format));
+      }
+   }
 
-   pipe_surface_reference(&surf, NULL);
+   pipe_buffer_unmap(screen, surface->buffer);
 }
 
 
-void
-cell_update_texture_mapping(struct cell_context *cell)
+static struct pipe_surface *
+cell_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned usage)
 {
-#if 0
-   uint face = 0, level = 0, zslice = 0;
-#endif
-   uint i;
-
-   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      if (cell->texture[i])
-         cell_tile_texture(cell, cell->texture[i]);
-   }
+   struct pipe_winsys *ws = screen->winsys;
+   struct cell_texture *ct = cell_texture(pt);
+   struct pipe_surface *ps;
 
-#if 0
-   if (cell->tex_surf && cell->tex_map) {
-      pipe_surface_unmap(cell->tex_surf);
-      cell->tex_map = NULL;
-   }
+   ps = ws->surface_alloc(ws);
+   if (ps) {
+      assert(ps->refcount);
+      assert(ps->winsys);
+      winsys_buffer_reference(ws, &ps->buffer, ct->buffer);
+      ps->format = pt->format;
+      ps->block = pt->block;
+      ps->width = pt->width[level];
+      ps->height = pt->height[level];
+      ps->nblocksx = pt->nblocksx[level];
+      ps->nblocksy = pt->nblocksy[level];
+      ps->stride = ct->stride[level];
+      ps->offset = ct->level_offset[level];
+      ps->usage = usage;
 
-   /* XXX free old surface */
+      /* XXX may need to override usage flags (see sp_texture.c) */
 
-   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
-                                         &cell->texture[0]->base,
-                                         face, level, zslice);
+      pipe_texture_reference(&ps->texture, pt); 
+      ps->face = face;
+      ps->level = level;
+      ps->zslice = zslice;
 
-   cell->tex_map = pipe_surface_map(cell->tex_surf);
-#endif
+      if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
+	         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+		      ps->nblocksy *
+		      ps->stride;
+      }
+      else {
+         assert(face == 0);
+         assert(zslice == 0);
+      }
+
+      if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+         /* convert from tiled to linear layout */
+         cell_untwiddle_texture(screen, ps);
+      }
+   }
+   return ps;
 }
 
 
@@ -313,11 +447,17 @@ static void
 cell_tex_surface_release(struct pipe_screen *screen, 
                          struct pipe_surface **s)
 {
-   /* Effectively do the texture_update work here - if texture images
-    * needed post-processing to put them into hardware layout, this is
-    * where it would happen.  For softpipe, nothing to do.
-    */
-   assert ((*s)->texture);
+   struct cell_texture *ct = cell_texture((*s)->texture);
+   const uint level = (*s)->level;
+
+   if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+   {
+      align_free(ct->untiled_data[level]);
+      ct->untiled_data[level] = NULL;
+   }
+
+   /* XXX if done rendering to teximage, re-tile */
+
    pipe_texture_reference(&(*s)->texture, NULL); 
 
    screen->winsys->surface_release(screen->winsys, s);
@@ -325,11 +465,15 @@ cell_tex_surface_release(struct pipe_screen *screen,
 
 
 static void *
-cell_surface_map( struct pipe_screen *screen,
-                  struct pipe_surface *surface,
-                  unsigned flags )
+cell_surface_map(struct pipe_screen *screen,
+                 struct pipe_surface *surface,
+                 unsigned flags)
 {
    ubyte *map;
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+
+   assert(ct);
 
    if (flags & ~surface->usage) {
       assert(0);
@@ -339,22 +483,15 @@ cell_surface_map( struct pipe_screen *screen,
    map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
-
-   /* May want to different things here depending on read/write nature
-    * of the map:
-    */
-   if (surface->texture &&
-       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
+   else
    {
-      /* Do something to notify sharing contexts of a texture change.
-       * In softpipe, that would mean flushing the texture cache.
-       */
-#if 0
-      cell_screen(screen)->timestamp++;
-#endif
+      if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+         return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+      }
+      else {
+         return (void *) (map + surface->offset);
+      }
    }
-   
-   return map + surface->offset;
 }
 
 
@@ -362,17 +499,21 @@ static void
 cell_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
-   pipe_buffer_unmap( screen, surface->buffer );
-}
+   struct cell_texture *ct = cell_texture(surface->texture);
 
+   assert(ct);
 
-void
-cell_init_texture_functions(struct cell_context *cell)
-{
-   /*cell->pipe.texture_update = cell_texture_update;*/
+   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+       (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+      /* convert from linear to tiled layout */
+      cell_twiddle_texture(screen, surface);
+   }
+
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
+
 void
 cell_init_screen_texture_funcs(struct pipe_screen *screen)
 {
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebce..7018b0c9bf7 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,19 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   /** Texture data in tiled layout is held here */
+   struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
+   /** Mapped, tiled texture data */
+   void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+   void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
@@ -62,14 +66,6 @@ cell_texture(struct pipe_texture *pt)
 
 
 extern void
-cell_update_texture_mapping(struct cell_context *cell);
-
-
-extern void
-cell_init_texture_functions(struct cell_context *cell);
-
-
-extern void
 cell_init_screen_texture_funcs(struct pipe_screen *screen);
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b934..65ba51b6bb2 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -38,6 +38,7 @@
 
 #include "cell_batch.h"
 #include "cell_context.h"
+#include "cell_fence.h"
 #include "cell_flush.h"
 #include "cell_spu.h"
 #include "cell_vbuf.h"
@@ -108,6 +109,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
    */
 
+   /* Make sure texture buffers aren't released until we're done rendering
+    * with them.
+    */
+   cell_add_fenced_textures(cell);
+
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e3..9cba537d9eb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
    int col3;
 
 
-   spe_lqd(p, shuf_hi, shuf_ptr, 3);
-   spe_lqd(p, shuf_lo, shuf_ptr, 4);
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
    spe_shufb(p, t1, row0, row2, shuf_hi);
    spe_shufb(p, t2, row0, row2, shuf_lo);
 
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
     */
    switch (count) {
    case 4:
-      spe_stqd(p, col3, dest_ptr, 3);
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
    case 3:
-      spe_stqd(p, col2, dest_ptr, 2);
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
    case 2:
-      spe_stqd(p, col1, dest_ptr, 1);
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
    case 1:
-      spe_stqd(p, col0, dest_ptr, 0);
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
    }
 
 
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -166,17 +168,17 @@ emit_fetch(struct spe_function *p,
    float scale_signed = 0.0;
    float scale_unsigned = 0.0;
 
-   spe_lqd(p, v0, in_ptr, 0 + offset[0]);
-   spe_lqd(p, v1, in_ptr, 1 + offset[0]);
-   spe_lqd(p, v2, in_ptr, 2 + offset[0]);
-   spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
    offset[0] += 4;
    
    switch (bytes) {
    case 1:
       scale_signed = 1.0f / 127.0f;
       scale_unsigned = 1.0f / 255.0f;
-      spe_lqd(p, tmp, shuf_ptr, 1);
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +187,7 @@ emit_fetch(struct spe_function *p,
    case 2:
       scale_signed = 1.0f / 32767.0f;
       scale_unsigned = 1.0f / 65535.0f;
-      spe_lqd(p, tmp, shuf_ptr, 2);
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +243,11 @@ emit_fetch(struct spe_function *p,
 
    switch (count) {
    case 1:
-      spe_stqd(p, float_zero, out_ptr, 1);
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
    case 2:
-      spe_stqd(p, float_zero, out_ptr, 2);
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
    case 3:
-      spe_stqd(p, float_one, out_ptr, 3);
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
    }
 
    if (float_zero != -1) {
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 00000000000..2be9a2d3242
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 1ae0dfb8c10..116453b79c5 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,10 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 
 SOURCES = \
-	spu_main.c \
+	spu_command.c \
 	spu_dcache.c \
+	spu_funcs.c \
+	spu_main.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
 	spu_texture.c \
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3e..d7ce0055248 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
 #define SPU_COLORPACK_H
 
 
+#include <transpose_matrix4x4.h>
 #include <spu_intrinsics.h>
 
 
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             10, 10, 10, 10,
-                             5, 5, 5, 5,
+                             2, 2, 2, 2,
+                             1, 1, 1, 1,
                              0, 0, 0, 0,
-                             15, 15, 15, 15}) );
+                             3, 3, 3, 3}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             5, 5, 5, 5,
-                             10, 10, 10, 10,
-                             15, 15, 15, 15,
+                             1, 1, 1, 1,
+                             2, 2, 2, 2,
+                             3, 3, 3, 3,
                              0, 0, 0, 0}) );
-
    return spu_convtf(color_u4, 32);
 }
 
 
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+                               vector float color_out[4])
+{
+   vector unsigned int c0;
+
+   c0 = spu_shuffle(color_in[0], color_in[0],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[0] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[1], color_in[1],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[1] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[2], color_in[2],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[2] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[3], color_in[3],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[3] = spu_convtf(c0, 32);
+
+   _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
 #endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 00000000000..8500d19754e
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,815 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+    ALIGN16_ATTRIB;
+
+
+
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE,
+                                              CELL_BUFFER_STATUS_FREE};
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+/**
+ * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory.
+ * There's a qword of status per SPU.
+ */
+static void
+cmd_fence(struct cell_command_fence *fence_cmd)
+{
+   static const vector unsigned int status = {CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED,
+                                              CELL_FENCE_SIGNALLED};
+   uint *dst = (uint *) fence_cmd->fence;
+   dst += 4 * spu.init.id;  /* main store/memory address, not local store */
+   ASSERT_ALIGN16(dst);
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_FENCE,           /* tag */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+   memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
+
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
+    */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
+   }
+
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Whoops. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
+
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      D_PRINTF(CELL_DEBUG_CMD, "  const[%u] = %f\n", i, constants[i]);
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words) */
+   return pos + 2 + num_const / 2;
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;
+      break;
+   }
+}
+
+
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   uint unit = sampler->unit;
+
+   D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit);
+
+   spu.sampler[unit] = sampler->state;
+
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   switch (spu.sampler[sampler->unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   switch (spu.sampler[sampler->unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture_2d[unit] = sample_texture_2d_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit];
+      break;
+   default:
+      ASSERT(0);
+   }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   const uint unit = texture->unit;
+   uint i;
+
+   D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit);
+
+   spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
+      uint depth = texture->depth[i];
+
+      D_PRINTF(CELL_DEBUG_CMD, "  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
+
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
+
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
+
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
+
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
+
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
+   }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+   D_PRINTF(CELL_DEBUG_CMD, "FINISH\n");
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * The opcode param encodes the location of the buffer and its size.
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
+      case CELL_CMD_STATE_RASTERIZER:
+         {
+            struct cell_command_rasterizer *rast =
+               (struct cell_command_rasterizer *) &buffer[pos];
+            spu.rasterizer = rast->rasterizer;
+            pos += sizeof(*rast) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_FENCE:
+         {
+            struct cell_command_fence *fence_cmd =
+               (struct cell_command_fence *) &buffer[pos];
+            cmd_fence(fence_cmd);
+            pos += sizeof(*fence_cmd) / 8;
+         }
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         break;
+      }
+   }
+
+   D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n");
+}
+
+
+#define PERF 0
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+   int exitFlag = 0;
+   uint t0, t1;
+
+   D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n");
+
+   while (!exitFlag) {
+      unsigned opcode;
+
+      D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n");
+
+      if (PERF)
+         spu_write_decrementer(~0);
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+      D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode);
+
+      if (PERF)
+         t0 = spu_read_decrementer();
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         D_PRINTF(CELL_DEBUG_CMD, "EXIT\n");
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+      }
+
+      if (PERF) {
+         t1 = spu_read_decrementer();
+         printf("wait mbox time: %gms   batch time: %gms\n",
+                (~0u - t0) * spu.init.inv_timebase,
+                (t0 - t1) * spu.init.inv_timebase);
+      }
+   }
+
+   D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n");
+
+   if (spu.init.debug_flags & CELL_DEBUG_CACHE)
+      spu_dcache_report();
+}
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).
+    */
+   if (spu.fragment_ops_code != NULL) {
+      free(spu.fragment_ops_code);
+   }
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 00000000000..83dcdade288
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+extern void
+command_loop(void);
+
+extern void
+spu_command_init(void);
+
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
index 167404cdc54..a6d67634fd8 100644
--- a/src/gallium/drivers/cell/spu/spu_dcache.c
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -36,7 +36,9 @@
 #define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
 #define CACHE_LOG2NNWAY       2
 #define CACHE_LOG2NSETS       6
-/*#define CACHE_STATS           1*/
+#ifdef DEBUG
+#define CACHE_STATS           1
+#endif
 #include <cache-api.h>
 
 /* Yes folks, this is ugly.
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 00000000000..ff3d609d258
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,173 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+#include "spu_texture.h"
+
+
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+   vector float v[4];
+};
+
+
+static vector float
+spu_cos(vector float x)
+{
+   return _cos14_v(x);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+   return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+   return _powf4(x, y);
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+   return _exp2f4(x);
+}
+
+static vector float
+spu_log2(vector float x)
+{
+   return _log2f4(x);
+}
+
+
+static struct vec_4x4
+spu_tex_2d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_3d(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) r;
+   (void) q;
+   spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v);
+   return colors;
+}
+
+static struct vec_4x4
+spu_tex_cube(vector float s, vector float t, vector float r, vector float q,
+           unsigned unit)
+{
+   struct vec_4x4 colors;
+   (void) q;
+   sample_texture_cube(s, t, r, unit, colors.v);
+   return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
+static void
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
+{
+   uint n = spu_functions->num;
+   ASSERT(strlen(name) < 16);
+   strcpy(spu_functions->names[n], name);
+   spu_functions->addrs[n] = (uint) addr;
+   spu_functions->num++;
+   ASSERT(spu_functions->num <= 16);
+}
+
+
+/**
+ * Return info about the SPU's function to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   int tag = TAG_MISC;
+
+   ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+
+   funcs.num = 0;
+   export_func(&funcs, "spu_cos", &spu_cos);
+   export_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_pow", &spu_pow);
+   export_func(&funcs, "spu_exp2", &spu_exp2);
+   export_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_tex_2d", &spu_tex_2d);
+   export_func(&funcs, "spu_tex_3d", &spu_tex_3d);
+   export_func(&funcs, "spu_tex_cube", &spu_tex_cube);
+
+   /* Send the function info back to the PPU / main memory */
+   mfc_put((void *) &funcs,  /* src in local store */
+           (unsigned int) spu.init.spu_functions, /* dst in main memory */
+           sizeof(funcs),  /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 00000000000..3adb6ae99f9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259c..97c86d194da 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,16 +32,15 @@
 #include <stdio.h>
 #include <libmisc.h>
 
+#include "pipe/p_defines.h"
+
+#include "spu_funcs.h"
+#include "spu_command.h"
 #include "spu_main.h"
-#include "spu_render.h"
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
-#include "spu_tile.h"
 //#include "spu_test.h"
-#include "spu_vertex_shader.h"
-#include "spu_dcache.h"
 #include "cell/common.h"
-#include "pipe/p_defines.h"
 
 
 /*
@@ -50,600 +49,8 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
-boolean Debug = FALSE;
-
 struct spu_global spu;
 
-struct spu_vs_context draw;
-
-
-/**
- * Buffers containing dynamically generated SPU code:
- */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
-
-
-
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
-/**
- * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
- * tiles back to the main framebuffer.
- */
-static void
-really_clear_tiles(uint surfaceIndex)
-{
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
-   if (surfaceIndex == 0) {
-      clear_c_tile(&spu.ctile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         }
-      }
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
-}
-
-
-static void
-cmd_clear_surface(const struct cell_command_clear_surface *clear)
-{
-   if (Debug)
-      printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
-             clear->surface, clear->value);
-
-   if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
-         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
-            (spu.init.id << 20) | (spu.init.id << 28);
-         spu.fb.color_clear_value ^= x;
-      }
-   }
-   else {
-      spu.fb.depth_clear_value = clear->value;
-   }
-
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-
-   /* Simply set all tiles' status to CLEAR.
-    * When we actually begin rendering into a tile, we'll initialize it to
-    * the clear value.  If any tiles go untouched during the frame,
-    * really_clear_tiles() will set them to the clear value.
-    */
-   if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
-   }
-   else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
-   }
-
-#else
-
-   /*
-    * This path clears the whole framebuffer to the clear color right now.
-    */
-
-   /*
-   printf("SPU: %s num=%d w=%d h=%d\n",
-          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
-   */
-
-   /* init a single tile to the clear value */
-   if (clear->surface == 0) {
-      clear_c_tile(&spu.ctile);
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-   }
-
-   /* walk over my tiles, writing the 'clear' tile's data */
-   {
-      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-      uint i;
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (clear->surface == 0)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         else
-            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
-      wait_on_mask(1 << TAG_SURFACE_CLEAR);
-   }
-
-#endif /* CLEAR_OPT */
-
-   if (Debug)
-      printf("SPU %u: CLEAR SURF done\n", spu.init.id);
-}
-
-
-static void
-cmd_release_verts(const struct cell_command_release_verts *release)
-{
-   if (Debug)
-      printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, release->vertex_buf);
-   ASSERT(release->vertex_buf != ~0U);
-   release_buffer(release->vertex_buf);
-}
-
-
-/**
- * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
- * This involves installing new fragment ops SPU code.
- * If this function is never called, we'll use a regular C fallback function
- * for fragment processing.
- */
-static void
-cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
-{
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
-   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
-   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
-
-   /* Point function pointer at new code */
-   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
-
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-}
-
-
-static void
-cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
-{
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_program_code, fp->code,
-          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
-#if 01
-   /* Point function pointer at new code */
-   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
-#endif
-}
-
-
-static void
-cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
-{
-   if (Debug)
-      printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             spu.init.id,
-             cmd->width,
-             cmd->height,
-             cmd->color_start,
-             cmd->color_format,
-             cmd->depth_format);
-
-   ASSERT_ALIGN16(cmd->color_start);
-   ASSERT_ALIGN16(cmd->depth_start);
-
-   spu.fb.color_start = cmd->color_start;
-   spu.fb.depth_start = cmd->depth_start;
-   spu.fb.color_format = cmd->color_format;
-   spu.fb.depth_format = cmd->depth_format;
-   spu.fb.width = cmd->width;
-   spu.fb.height = cmd->height;
-   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
-   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
-
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z32_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0xffffffffu;
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0x00ffffffu;
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      spu.fb.zsize = 2;
-      spu.fb.zscale = (float) 0xffffu;
-      break;
-   default:
-      spu.fb.zsize = 0;
-      break;
-   }
-}
-
-
-static void
-cmd_state_sampler(const struct cell_command_sampler *sampler)
-{
-   if (Debug)
-      printf("SPU %u: SAMPLER [%u]\n",
-             spu.init.id, sampler->unit);
-
-   spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
-}
-
-
-static void
-cmd_state_texture(const struct cell_command_texture *texture)
-{
-   const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
-
-   if (Debug) {
-      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
-             texture->unit, texture->start,
-             texture->width, texture->height);
-   }
-
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
-
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
-
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
-}
-
-
-static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
-{
-   if (Debug) {
-      printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
-             vinfo->num_attribs);
-   }
-   ASSERT(vinfo->num_attribs >= 1);
-   ASSERT(vinfo->num_attribs <= 8);
-   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
-}
-
-
-static void
-cmd_state_vs_array_info(const struct cell_array_info *vs_info)
-{
-   const unsigned attr = vs_info->attr;
-
-   ASSERT(attr < PIPE_MAX_ATTRIBS);
-   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
-   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
-   draw.vertex_fetch.size[attr] = vs_info->size;
-   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
-   draw.vertex_fetch.dirty = 1;
-}
-
-
-static void
-cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
-{
-   mfc_get(attribute_fetch_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   draw.vertex_fetch.code = attribute_fetch_code_buffer;
-}
-
-
-static void
-cmd_finish(void)
-{
-   if (Debug)
-      printf("SPU %u: FINISH\n", spu.init.id);
-   really_clear_tiles(0);
-   /* wait for all outstanding DMAs to finish */
-   mfc_write_tag_mask(~0);
-   mfc_read_tag_status_all();
-   /* send mbox message to PPU */
-   spu_write_out_mbox(CELL_CMD_FINISH);
-}
-
-
-/**
- * Execute a batch of commands which was sent to us by the PPU.
- * See the cell_emit_state.c code to see where the commands come from.
- *
- * The opcode param encodes the location of the buffer and its size.
- */
-static void
-cmd_batch(uint opcode)
-{
-   const uint buf = (opcode >> 8) & 0xff;
-   uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
-   uint pos;
-
-   if (Debug)
-      printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.buffers[buf]);
-
-   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   size = ROUNDUP16(size);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.buffers[buf],  /* src */
-           size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   /* Tell PPU we're done copying the buffer to local store */
-   if (Debug)
-      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
-   release_buffer(buf);
-
-   /*
-    * Loop over commands in the batch buffer
-    */
-   for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
-      /*
-       * rendering commands
-       */
-      case CELL_CMD_CLEAR_SURFACE:
-         {
-            struct cell_command_clear_surface *clr
-               = (struct cell_command_clear_surface *) &buffer[pos];
-            cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
-         }
-         break;
-      case CELL_CMD_RENDER:
-         {
-            struct cell_command_render *render
-               = (struct cell_command_render *) &buffer[pos];
-            uint pos_incr;
-            cmd_render(render, &pos_incr);
-            pos += pos_incr;
-         }
-         break;
-      /*
-       * state-update commands
-       */
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_OPS:
-         {
-            struct cell_command_fragment_ops *fops
-               = (struct cell_command_fragment_ops *) &buffer[pos];
-            cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
-         {
-            struct cell_command_fragment_program *fp
-               = (struct cell_command_fragment_program *) &buffer[pos];
-            cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_SAMPLER:
-         {
-            struct cell_command_sampler *sampler
-               = (struct cell_command_sampler *) &buffer[pos];
-            cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_TEXTURE:
-         {
-            struct cell_command_texture *texture
-               = (struct cell_command_texture *) &buffer[pos];
-            cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_VERTEX_INFO:
-         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
-         break;
-      case CELL_CMD_STATE_VIEWPORT:
-         (void) memcpy(& draw.viewport, &buffer[pos+1],
-                       sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
-         break;
-      case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
-         pos += 2;
-         break;
-      case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
-         break;
-      case CELL_CMD_STATE_BIND_VS:
-#if 0
-         spu_bind_vertex_shader(&draw,
-                                (struct cell_shader_info *) &buffer[pos+1]);
-#endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
-         break;
-      case CELL_CMD_STATE_ATTRIB_FETCH:
-         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
-                                &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
-         break;
-      /*
-       * misc commands
-       */
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
-      case CELL_CMD_RELEASE_VERTS:
-         {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
-         }
-         break;
-      case CELL_CMD_FLUSH_BUFFER_RANGE: {
-	 struct cell_buffer_range *br = (struct cell_buffer_range *)
-	     &buffer[pos+1];
-
-	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
-	 break;
-      }
-      default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
-         ASSERT(0);
-         break;
-      }
-   }
-
-   if (Debug)
-      printf("SPU %u: BATCH complete\n", spu.init.id);
-}
-
-
-/**
- * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
- */
-static void
-main_loop(void)
-{
-   struct cell_command cmd;
-   int exitFlag = 0;
-
-   if (Debug)
-      printf("SPU %u: Enter main loop\n", spu.init.id);
-
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
-   while (!exitFlag) {
-      unsigned opcode;
-      int tag = 0;
-
-      if (Debug)
-         printf("SPU %u: Wait for cmd...\n", spu.init.id);
-
-      /* read/wait from mailbox */
-      opcode = (unsigned int) spu_read_in_mbox();
-
-      if (Debug)
-         printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
-
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
-      switch (opcode & CELL_CMD_OPCODE_MASK) {
-      case CELL_CMD_EXIT:
-         if (Debug)
-            printf("SPU %u: EXIT\n", spu.init.id);
-         exitFlag = 1;
-         break;
-      case CELL_CMD_VS_EXECUTE:
-#if 0
-         spu_execute_vertex_shader(&draw, &cmd.vs);
-#endif
-         break;
-      case CELL_CMD_BATCH:
-         cmd_batch(opcode);
-         break;
-      default:
-         printf("Bad opcode!\n");
-      }
-
-   }
-
-   if (Debug)
-      printf("SPU %u: Exit main loop\n", spu.init.id);
-
-   spu_dcache_report();
-}
-
-
 
 static void
 one_time_init(void)
@@ -651,15 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function.
-    */
-   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -682,12 +82,16 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
-   if (Debug)
-      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
+   /* get initialization data */
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
            sizeof(struct cell_init_info), /* bytes */
@@ -696,12 +100,18 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+   if (spu.init.id == 0) {
+      return_function_info();
+   }
+
 #if 0
    if (spu.init.id==0)
-      spu_test_misc();
+      spu_test_misc(spu.init.id);
 #endif
 
-   main_loop();
+   command_loop();
+
+   spu_command_close();
 
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b6258402..33767e7c51d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -36,9 +36,18 @@
 #include "pipe/p_state.h"
 
 
-
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
+#if DEBUG
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#else
+#define D_PRINTF(...)
+#endif
 
 
 /**
@@ -61,8 +70,11 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
+typedef void (*spu_sample_texture_2d_func)(vector float s,
+                                           vector float t,
+                                           uint unit, uint level, uint face,
+                                           vector float colors[4]);
+
 
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
@@ -76,9 +88,9 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector unsigned int mask);
 
 /** Function for running fragment program */
-typedef void (*spu_fragment_program_func)(vector float *inputs,
-                                          vector float *outputs,
-                                          vector float *constants);
+typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
+                                                         vector float *outputs,
+                                                         vector float *constants);
 
 
 struct spu_framebuffer
@@ -98,15 +110,27 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+/** per-texture level info */
+struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   uint bytes_per_image;
+   /** texcoord scale factors */
+   vector float scale_s, scale_t, scale_r;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t, mask_r;
+   /** texcoord clamp limits */
+   vector signed int max_s, max_t, max_r;
+} ALIGN16_ATTRIB;
+
+
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
@@ -124,7 +148,9 @@ struct spu_global
    struct spu_framebuffer fb;
    struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
    struct pipe_blend_state blend;
+   struct pipe_blend_color blend_color;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_rasterizer_state rasterizer;
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
    struct vertex_info vertex_info;
 
@@ -133,39 +159,38 @@ struct spu_global
    tile_t ztile ALIGN16_ATTRIB;
 
    /** Read depth/stencil tiles? */
-   boolean read_depth;
-   boolean read_stencil;
+   boolean read_depth_stencil;
 
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
    /** Status of all tiles in framebuffer */
-   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
-   /** Current fragment ops function */
-   spu_fragment_ops_func fragment_ops;
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
 
-   /** Current fragment program machine code */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program machine code, at 8-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS];
+   spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
 
 extern struct spu_global spu;
-extern boolean Debug;
-
 
 
 
@@ -184,7 +209,7 @@ extern boolean Debug;
 #define TAG_DCACHE1           21
 #define TAG_DCACHE2           22
 #define TAG_DCACHE3           23
-
+#define TAG_FENCE             24
 
 
 static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845b..683664e8a4e 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -40,6 +40,24 @@
 #define LINEAR_QUAD_LAYOUT 1
 
 
+static INLINE vector float
+spu_min(vector float a, vector float b)
+{
+   vector unsigned int m;
+   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
+   return spu_sel(a, b, m);
+}
+
+
+static INLINE vector float
+spu_max(vector float a, vector float b)
+{
+   vector unsigned int m;
+   m = spu_cmpgt(a, b);    /* m = a > b ? ~0 : 0 */
+   return spu_sel(b, a, m);
+}
+
+
 /**
  * Called by rasterizer for each quad after the shader has run.  Do
  * all the per-fragment operations including alpha test, z test,
@@ -60,9 +78,12 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector unsigned int mask)
 {
    vector float frag_aos[4];
-   unsigned int c0, c1, c2, c3;
+   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
 
-   /* do alpha test */
+   /*
+    * Do alpha test
+    */
    if (spu.depth_stencil_alpha.alpha.enabled) {
       vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
       vector unsigned int amask;
@@ -102,7 +123,10 @@ spu_fallback_fragment_ops(uint x, uint y,
       mask = spu_and(mask, amask);
    }
 
-   /* Z and/or stencil testing... */
+
+   /*
+    * Z and/or stencil testing...
+    */
    if (spu.depth_stencil_alpha.depth.enabled ||
        spu.depth_stencil_alpha.stencil[0].enabled) {
 
@@ -178,6 +202,32 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
    if (spu.blend.blend_enable) {
       /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
@@ -186,43 +236,30 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       vector float fbRGBA[4];  /* current framebuffer colors */
 
-      /* get colors from framebuffer/tile */
+      /* convert framebuffer colors from packed int to vector float */
       {
-         vector float fc[4];
-         uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
-         c0 = colorTile->ui[y][x*2+0];
-         c1 = colorTile->ui[y][x*2+1];
-         c2 = colorTile->ui[y][x*2+2];
-         c3 = colorTile->ui[y][x*2+3];
-#else
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#endif
+         vector float temp[4]; /* float colors in AOS form */
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
-            fc[0] = spu_unpack_B8G8R8A8(c0);
-            fc[1] = spu_unpack_B8G8R8A8(c1);
-            fc[2] = spu_unpack_B8G8R8A8(c2);
-            fc[3] = spu_unpack_B8G8R8A8(c3);
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
             break;
          case PIPE_FORMAT_A8R8G8B8_UNORM:
-            fc[0] = spu_unpack_A8R8G8B8(c0);
-            fc[1] = spu_unpack_A8R8G8B8(c1);
-            fc[2] = spu_unpack_A8R8G8B8(c2);
-            fc[3] = spu_unpack_A8R8G8B8(c3);
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
             break;
          default:
             ASSERT(0);
          }
-         _transpose_matrix4x4(fbRGBA, fc);
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
       }
 
       /*
-       * Compute Src RGB terms
+       * Compute Src RGB terms (fragment color * factor)
        */
       switch (spu.blend.rgb_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -245,13 +282,33 @@ spu_fallback_fragment_ops(uint x, uint y,
          term1g = spu_mul(fragG, fragA);
          term1b = spu_mul(fragB, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term1r = spu_mul(fragR, fbRGBA[0]);
+         term1g = spu_mul(fragG, fbRGBA[1]);
+         term1b = spu_mul(fragB, fbRGBA[1]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1r = spu_mul(fragR, fbRGBA[3]);
+         term1g = spu_mul(fragG, fbRGBA[3]);
+         term1b = spu_mul(fragB, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3]));
+         term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Src Alpha term
+       * Compute Src Alpha term (fragment alpha * factor)
        */
       switch (spu.blend.alpha_src_factor) {
       case PIPE_BLENDFACTOR_ONE:
@@ -263,19 +320,29 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLENDFACTOR_SRC_ALPHA:
          term1a = spu_mul(fragA, fragA);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term1a = spu_mul(fragA, fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest RGB terms
+       * Compute Dest RGB terms (framebuffer color * factor)
        */
       switch (spu.blend.rgb_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2r = fragR;
-         term2g = fragG;
-         term2b = fragB;
+         term2r = fbRGBA[0];
+         term2g = fbRGBA[1];
+         term2b = fbRGBA[2];
          break;
       case PIPE_BLENDFACTOR_ZERO:
          term2r =
@@ -299,17 +366,37 @@ spu_fallback_fragment_ops(uint x, uint y,
          term2g = spu_mul(fbRGBA[1], tmp);
          term2b = spu_mul(fbRGBA[2], tmp);
          break;
-      /* XXX more cases */
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[0]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[1]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[2]);
+         break;
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fbRGBA[3]);
+         term2g = spu_mul(fbRGBA[1], fbRGBA[3]);
+         term2b = spu_mul(fbRGBA[2], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2]));
+         break;
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3]));
+         term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3]));
+         term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3]));
+         break;
+       /* XXX more cases */
       default:
          ASSERT(0);
       }
 
       /*
-       * Compute Dest Alpha term
+       * Compute Dest Alpha term (framebuffer alpha * factor)
        */
       switch (spu.blend.alpha_dst_factor) {
       case PIPE_BLENDFACTOR_ONE:
-         term2a = fragA;
+         term2a = fbRGBA[3];
          break;
       case PIPE_BLENDFACTOR_SRC_COLOR:
          term2a = spu_splats(0.0f);
@@ -322,6 +409,16 @@ spu_fallback_fragment_ops(uint x, uint y,
          tmp = spu_sub(one, fragA);
          term2a = spu_mul(fbRGBA[3], tmp);
          break;
+      case PIPE_BLENDFACTOR_DST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_DST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fbRGBA[3]);
+         break;
+      case PIPE_BLENDFACTOR_CONST_COLOR:
+         /* fall-through */
+      case PIPE_BLENDFACTOR_CONST_ALPHA:
+         term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3]));
+         break;
       /* XXX more cases */
       default:
          ASSERT(0);
@@ -341,7 +438,21 @@ spu_fallback_fragment_ops(uint x, uint y,
          fragG = spu_sub(term1g, term2g);
          fragB = spu_sub(term1b, term2b);
          break;
-      /* XXX more cases */
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragR = spu_sub(term2r, term1r);
+         fragG = spu_sub(term2g, term1g);
+         fragB = spu_sub(term2b, term1b);
+         break;
+      case PIPE_BLEND_MIN:
+         fragR = spu_min(term1r, term2r);
+         fragG = spu_min(term1g, term2g);
+         fragB = spu_min(term1b, term2b);
+         break;
+      case PIPE_BLEND_MAX:
+         fragR = spu_max(term1r, term2r);
+         fragG = spu_max(term1g, term2g);
+         fragB = spu_max(term1b, term2b);
+         break;
       default:
          ASSERT(0);
       }
@@ -356,7 +467,15 @@ spu_fallback_fragment_ops(uint x, uint y,
       case PIPE_BLEND_SUBTRACT:
          fragA = spu_sub(term1a, term2a);
          break;
-      /* XXX more cases */
+      case PIPE_BLEND_REVERSE_SUBTRACT:
+         fragA = spu_sub(term2a, term1a);
+         break;
+      case PIPE_BLEND_MIN:
+         fragA = spu_min(term1a, term2a);
+         break;
+      case PIPE_BLEND_MAX:
+         fragA = spu_max(term1a, term2a);
+         break;
       default:
          ASSERT(0);
       }
@@ -384,21 +503,20 @@ spu_fallback_fragment_ops(uint x, uint y,
 #endif
 
    /*
-    * Pack float colors into 32-bit RGBA words.
+    * Pack fragment float colors into 32-bit RGBA words.
     */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
-      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
-      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
-      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
-      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
-      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
-      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
    default:
       fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +525,57 @@ spu_fallback_fragment_ops(uint x, uint y,
 
 
    /*
-    * Color masking
+    * Do color masking
     */
    if (spu.blend.colormask != 0xf) {
-      /* XXX to do */
-      /* apply color mask to 32-bit packed colors */
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.colormask & PIPE_MASK_R)
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.colormask & PIPE_MASK_G)
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.colormask & PIPE_MASK_B)
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.colormask & PIPE_MASK_A)
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.colormask & PIPE_MASK_R)
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.colormask & PIPE_MASK_G)
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.colormask & PIPE_MASK_B)
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.colormask & PIPE_MASK_A)
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
    }
 
 
    /*
-    * Logic Ops
+    * Do logic ops
     */
    if (spu.blend.logicop_enable) {
       /* XXX to do */
-      /* apply logicop to 32-bit packed colors */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
    }
 
 
@@ -431,45 +586,46 @@ spu_fallback_fragment_ops(uint x, uint y,
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
    else {
+      /* write no fragments */
       return;
    }
 
 
    /*
-    * Write new quad colors to the framebuffer/tile.
+    * Write new fragment/quad colors to the framebuffer/tile.
     * Only write pixels where the corresponding mask word is set.
     */
 #if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
-    *  |p0|p1|p2|p3|
+    *  |p0|p1|p2|p3|...
     *  +--+--+--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y][x*2] = c0;
+      colorTile->ui[y][x*2] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y][x*2+1] = c1;
+      colorTile->ui[y][x*2+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y][x*2+2] = c2;
+      colorTile->ui[y][x*2+2] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;
+      colorTile->ui[y][x*2+3] = fragc3;
 #else
    /*
     * Quad layout:
     *  +--+--+
-    *  |p0|p1|
+    *  |p0|p1|...
     *  +--+--+
-    *  |p2|p3|
+    *  |p2|p3|...
     *  +--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
+      colorTile->ui[y+0][x+0] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
+      colorTile->ui[y+0][x+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
+      colorTile->ui[y+1][x+0] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
+      colorTile->ui[y+1][x+1] = fragc3;
 #endif
 }
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc988810..7c225e2f27c 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
@@ -175,22 +175,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
+   uint num_tiles;
 
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-   }
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n",
+            render->prim_type,
+            render->num_verts,
+            render->num_indexes,
+            render->inline_verts);
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
@@ -251,6 +243,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
+   num_tiles = 0;
+
    /**
     ** loop over tiles, rendering tris
     **/
@@ -264,6 +258,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      num_tiles++;
+
       spu.cur_ctile_status = spu.ctile_status[ty][tx];
       spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
@@ -293,9 +289,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
-   if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
+   D_PRINTF(CELL_DEBUG_CMD,
+            "RENDER done (%u tiles hit)\n",
+            num_tiles);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f80..69784c89788 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <math.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -40,37 +42,19 @@
 void
 invalidate_tex_cache(void)
 {
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
-
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
-}
+   uint lvl;
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;
+      uint bytes = 4 * spu.texture[unit].level[lvl].width
+         * spu.texture[unit].level[lvl].height;
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
-   /*
-    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
-    * SIMD since X and Y are already in a SIMD register.
-    */
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   ushort x = spu_extract(coordinate, 0);
-   ushort y = spu_extract(coordinate, 1);
-   unsigned tile_offset = sizeof(tile_t)
-      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
-   ushort texel_offset = (ushort) 4
-      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
-   vec_uint4 tmp;
-
-   spu_dcache_fetch_unaligned((qword *) & tmp,
-                              texture_ea + tile_offset + texel_offset,
-                              4);
-   return spu_extract(tmp, 0);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -88,15 +72,17 @@ get_texel(uint unit, vec_uint4 coordinate)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(const struct spu_texture_level *tlevel, uint face,
+                vec_int4 x, vec_int4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   unsigned texture_ea = (uintptr_t) tlevel->start;
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,6 +93,8 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -118,83 +106,536 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);
+   return vec;
+}
+
+
+
 /**
- * Get texture sample at texcoord.
+ * Do nearest texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
+void
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4])
 {
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
+   vec_uint4 texels[4];
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
+
+   get_four_texels(tlevel, face, is, it, texels);
+
+   /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
+   /* is + 1, it + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
 
-   get_four_texels(unit, x, y, texels);
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 
+   /* convert packed int texels to float colors */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
     */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
+
+
+
+/**
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut0,
+          vector unsigned int *mOut1,
+          vector unsigned int *mOut2,
+          vector unsigned int *mOut3,
+          vector unsigned int *mIn)
+{
+  vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
+  vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
+  vector unsigned int aibj, ckdl, emfn, gohp;	/* intermediate vectors */
+
+  vector unsigned char shufflehi = ((vector unsigned char) {
+					       0x00, 0x01, 0x02, 0x03,
+					       0x10, 0x11, 0x12, 0x13,
+					       0x04, 0x05, 0x06, 0x07,
+					       0x14, 0x15, 0x16, 0x17});
+  vector unsigned char shufflelo = ((vector unsigned char) {
+					       0x08, 0x09, 0x0A, 0x0B,
+					       0x18, 0x19, 0x1A, 0x1B,
+					       0x0C, 0x0D, 0x0E, 0x0F,
+					       0x1C, 0x1D, 0x1E, 0x1F});
+  abcd = *(mIn+0);
+  efgh = *(mIn+1);
+  ijkl = *(mIn+2);
+  mnop = *(mIn+3);
+
+  aibj = spu_shuffle(abcd, ijkl, shufflehi);
+  ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+  emfn = spu_shuffle(efgh, mnop, shufflehi);
+  gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+  aeim = spu_shuffle(aibj, emfn, shufflehi);
+  bfjn = spu_shuffle(aibj, emfn, shufflelo);
+  cgko = spu_shuffle(ckdl, gohp, shufflehi);
+  dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+  *mOut0 = aeim;
+  *mOut1 = bfjn;
+  *mOut2 = cgko;
+  *mOut3 = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic for computing
+ * sample weights.
+ */
+void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4])
+{
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   /* Scale texcoords by size of texture, and add half pixel bias */
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
+
+   /* convert float coords to fixed-pt coords with 7 fraction bits */
+   vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */
+   vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */
+
+   /* compute integer texel weights in [0, 127] */
+   vector signed int sWeights0 = spu_and(is, 127);
+   vector signed int tWeights0 = spu_and(it, 127);
+   vector signed int sWeights1 = spu_sub(127, sWeights0);
+   vector signed int tWeights1 = spu_sub(127, tWeights0);
+
+   /* texel coords: is0 = is / 128, it0 = is / 128 */
+   vector signed int is0 = spu_rlmask(is, -7);
+   vector signed int it0 = spu_rlmask(it, -7);
+
+   /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
+
+   /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+   {
+      static const unsigned char ZERO = 0x80;
+      int i;
+      for (i = 0; i < 16; i++) {
+         texels[i] = spu_shuffle(texels[i], texels[i],
+                                 ((vector unsigned char) {
+                                    ZERO, ZERO, ZERO, 1,
+                                    ZERO, ZERO, ZERO, 2,
+                                    ZERO, ZERO, ZERO, 3,
+                                    ZERO, ZERO, ZERO, 0}));
+      }
+   }
+
+   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+   transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+   /* computed weighted colors */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 22);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 22);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 22);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 22);
+}
+
+
+
+/**
+ * Compute level of detail factor from texcoords.
+ */
+static INLINE float
+compute_lambda_2d(uint unit, vector float s, vector float t)
+{
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].width;
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+#if 0
+   /* ideal value */
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+#else
+   /* approximation */
+   dsdx = fabsf(dsdx);
+   dsdy = fabsf(dsdy);
+   dtdx = fabsf(dtdx);
+   dtdy = fabsf(dtdy);
+   float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
+#endif
+   float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
+   return lambda;
+}
+
+
+/**
+ * Blend two sets of colors according to weight.
+ */
+static void
+blend_colors(vector float c0[4], const vector float c1[4], float weight)
+{
+   vector float t = spu_splats(weight);
+   vector float dc0 = spu_sub(c1[0], c0[0]);
+   vector float dc1 = spu_sub(c1[1], c0[1]);
+   vector float dc2 = spu_sub(c1[2], c0[2]);
+   vector float dc3 = spu_sub(c1[3], c0[3]);
+   c0[0] = spu_madd(dc0, t, c0[0]);
+   c0[1] = spu_madd(dc1, t, c0[1]);
+   c0[2] = spu_madd(dc2, t, c0[2]);
+   c0[3] = spu_madd(dc3, t, c0[3]);
+}
+
+
+/**
+ * Texture sampling with level of detail selection and possibly mipmap
+ * interpolation.
+ */
+void
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level_ignored, uint face,
+                      vector float colors[4])
+{
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
+   float lambda = compute_lambda_2d(unit, s, t);
+
+   (void) face;
+   (void) level_ignored;
+
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   if (lambda <= 0.0f) {
+      /* magnify */
+      spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
+   }
+   else {
+      /* minify */
+      if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+         /* sample two mipmap levels and interpolate */
+         int level = (int) lambda;
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+         if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+            /* sample second mipmap level */
+            float weight = lambda - (float) level;
+            level++;
+            if (level <= (int) spu.texture[unit].max_level) {
+               vector float colors2[4];
+               spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
+               blend_colors(colors, colors2, weight);
+            }
+         }
+      }
+      else {
+         /* sample one mipmap level */
+         int level = (int) (lambda + 0.5f);
+         if (level > (int) spu.texture[unit].max_level)
+            level = spu.texture[unit].max_level;
+         spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
+      }
+   }
+}
+
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx > ary && arx > arz) {
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary > arx && ary > arz) {
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+
+void
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4])
+{
+   uint p, faces[4], level = 0;
+   float newS[4], newT[4];
+
+   /* Compute cube faces referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces */
+      for (p = 0; p < 4; p++) {      
+         vector float c[4];
+
+         spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
+                                     unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be88..7b75b007b5a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,12 +36,32 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
+extern void
+sample_texture_2d_nearest(vector float s, vector float t,
+                          uint unit, uint level, uint face,
+                          vector float colors[4]);
+
+
+extern void
+sample_texture_2d_bilinear(vector float s, vector float t,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
+
+extern void
+sample_texture_2d_bilinear_int(vector float s, vector float t,
+                               uint unit, uint level, uint face,
+                               vector float colors[4]);
+
 
+extern void
+sample_texture_2d_lod(vector float s, vector float t,
+                      uint unit, uint level, uint face,
+                      vector float colors[4]);
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
+
+extern void
+sample_texture_cube(vector float s, vector float t, vector float r,
+                    uint unit, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 216a33126b7..6905015a483 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
            0  /* rid */);
 }
 
+
+/**
+ * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
+ * tiles back to the main framebuffer.
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 1b5491112db..7bfb52be8f3 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -36,12 +36,14 @@
 
 
 
-void
+extern void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
 
-void
+extern void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 
+extern void
+really_clear_tiles(uint surfaceIndex);
 
 
 static INLINE void
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b938781920..22e51a86ae5 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -43,11 +43,6 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
 
 
 /**
@@ -91,9 +86,9 @@ struct edge {
 
 struct interp_coef
 {
-   float4 a0;
-   float4 dadx;
-   float4 dady;
+   vector float a0;
+   vector float dadx;
+   vector float dady;
 };
 
 
@@ -116,21 +111,15 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneoverarea;
+   float oneOverArea;  /* XXX maybe make into vector? */
+
+   uint facing;
 
-   uint tx, ty;
+   uint tx, ty;  /**< position of current tile (x, y) */
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 
-#if 0
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
-   struct quad_header quad; 
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -142,118 +131,105 @@ struct setup_stage {
 };
 
 
-
 static struct setup_stage setup;
 
 
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
-   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (setup.quad.x0 >= maxx ||
-       setup.quad.y0 >= maxy ||
-       setup.quad.x0 + 1 < minx ||
-       setup.quad.y0 + 1 < miny) {
-      /* totally clipped */
-      setup.quad.mask = 0x0;
-      return;
-   }
-   if (setup.quad.x0 < minx)
-      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup.quad.y0 < miny)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup.quad.x0 == maxx - 1)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup.quad.y0 == maxy - 1)
-      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
-   quad_clip(setup);
-   if (setup.quad.mask) {
-      struct softpipe_context *sp = setup.softpipe;
-      sp->quad.first->run(sp->quad.first, &setup.quad);
-   }
-}
-#endif
-
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
  * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 {
-   switch (spu.vertex_info.interp_mode[slot]) {
+   switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
-
    case INTERP_LINEAR:
-      /* fall-through, for now */
-   default:
       {
-         register vector float dadx = setup.coef[slot].dadx.v;
-         register vector float dady = setup.coef[slot].dady.v;
-         register vector float topLeft
-            = spu_add(setup.coef[slot].a0.v,
-                      spu_add(spu_mul(spu_splats(x), dadx),
-                              spu_mul(spu_splats(y), dady)));
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
 
          result[QUAD_TOP_LEFT] = topLeft;
          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
+      break;
+   case INTERP_PERSPECTIVE:
+      {
+         vector float dadx = setup.coef[slot].dadx;
+         vector float dady = setup.coef[slot].dady;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
+
+         vector float wInv = spu_re(w);  /* 1.0 / w */
+
+         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
+         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
+         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
+         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
+      }
+      break;
+   case INTERP_POS:
+   case INTERP_NONE:
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
 
+/**
+ * As above, but return 4 vectors in SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
+{
+   eval_coeff(slot, x, y, w, result);
+   _transpose_matrix4x4(result, result);
+}
+
+
+/** Evalute coefficients to get Z for four pixels in a quad */
 static INLINE vector float
 eval_z(float x, float y)
 {
    const uint slot = 0;
-   const float dzdx = setup.coef[slot].dadx.f[2];
-   const float dzdy = setup.coef[slot].dady.f[2];
-   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
+   const float dzdy = spu_extract(setup.coef[slot].dady, 2);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
    const vector float topLeftv = spu_splats(topLeft);
    const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
    return spu_add(topLeftv, derivs);
 }
 
 
+/** Evalute coefficients to get W for four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+   const uint slot = 0;
+   const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
+   const float dwdy = spu_extract(setup.coef[slot].dady, 3);
+   const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
+   return spu_add(topLeftv, derivs);
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -261,120 +237,59 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture[0].start) {
-         /* texture mapping */
-         const uint unit = 0;
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
+      {
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
+          */
+         vector float inputs[4*4], outputs[2*4];
+         vector float fragZ = eval_z((float) x, (float) y);
+         vector float fragW = eval_w((float) x, (float) y);
+         vector unsigned int kill_mask;
 
-      }
-      else {
-         /* simple shading */
+         /* setup inputs */
 #if 0
-         eval_coeff(1, (float) x, (float) y, colors);
-
+         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 #else
-         /* XXX new fragment program code */
-
-         if (spu.fragment_program) {
-            vector float inputs[4*4], outputs[2*4];
-
-            /* setup inputs */
-            eval_coeff(1, (float) x, (float) y, inputs);
-
-            /* Execute the current fragment program */
-            spu.fragment_program(inputs, outputs, spu.constants);
-
-            /* Copy outputs */
-            colors[0] = outputs[0*4+0];
-            colors[1] = outputs[0*4+1];
-            colors[2] = outputs[0*4+2];
-            colors[3] = outputs[0*4+3];
-
-            if (0 && spu.init.id==0 && y == 48) {
-               printf("colors[0] = %f %f %f %f\n",
-                      spu_extract(colors[0], 0),
-                      spu_extract(colors[0], 1),
-                      spu_extract(colors[0], 2),
-                      spu_extract(colors[0], 3));
-               printf("colors[1] = %f %f %f %f\n",
-                      spu_extract(colors[1], 0),
-                      spu_extract(colors[1], 1),
-                      spu_extract(colors[1], 2),
-                      spu_extract(colors[1], 3));
-            }
-
+         uint i;
+         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
          }
 #endif
-      }
-
-
-      {
-         /* Convert fragment data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          * This is temporary!
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
+
+         /* Execute the current fragment program */
+         kill_mask = spu.fragment_program(inputs, outputs, spu.constants);
+
+         mask = spu_andc(mask, kill_mask);
+
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
           */
-         vector float soa_frag[4];
-         _transpose_matrix4x4(soa_frag, colors);
-
-         float4 fragZ;
-
-         fragZ.v = eval_z((float) x, (float) y);
-
-         /* Do all per-fragment/quad operations here, including:
-          *  alpha test, z test, stencil test, blend and framebuffer writing.
-          */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                          fragZ.v,
-                          soa_frag[0], soa_frag[1],
-                          soa_frag[2], soa_frag[3],
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
                           mask);
       }
-
    }
 }
 
@@ -383,7 +298,8 @@ emit_quad( int x, int y, mask_t mask )
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int block( int x )
+static INLINE int
+block(int x)
 {
    return x & ~1;
 }
@@ -394,7 +310,8 @@ static INLINE int block( int x )
  * the triangle's bounds.
  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
 {
    /* This is a little tricky.
     * Use & instead of && to avoid branches.
@@ -412,7 +329,8 @@ static INLINE mask_t calculate_mask( int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( void )
+static void
+flush_spans(void)
 {
    int minleft, maxright;
    int x;
@@ -440,7 +358,6 @@ static void flush_spans( void )
       return;
    }
 
-
    /* OK, we're very likely to need the tile data now.
     * clear or finish waiting if needed.
     */
@@ -457,7 +374,7 @@ static void flush_spans( void )
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
@@ -476,9 +393,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
-#endif
+      emit_quad( x, setup.span.y, calculate_mask( x ));
    }
 
    setup.span.y = 0;
@@ -487,33 +402,46 @@ static void flush_spans( void )
    setup.span.right[1] = 0;
 }
 
+
 #if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
 {
-   int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup.quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
-              v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
+   uint i;
+   fprintf(stderr, "  Vertex: (%p)\n", v);
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      fprintf(stderr, "    %d: %f %f %f %f\n",  i, 
+              spu_extract(v->data[i], 0),
+              spu_extract(v->data[i], 1),
+              spu_extract(v->data[i], 2),
+              spu_extract(v->data[i], 3));
    }
 }
 #endif
 
 
-static boolean setup_sort_vertices(const struct vertex_header *v0,
-                                   const struct vertex_header *v1,
-                                   const struct vertex_header *v2)
+/**
+ * Sort vertices from top to bottom.
+ * Compute area and determine front vs. back facing.
+ * Do coarse clip test against tile bounds
+ * \return  FALSE if tri is totally outside tile, TRUE otherwise
+ */
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+                    const struct vertex_header *v1,
+                    const struct vertex_header *v2)
 {
+   float area, sign;
 
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
-   print_vertex(v0);
-   print_vertex(v1);
-   print_vertex(v2);
+   if (spu.init.id==0) {
+      fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id);
+      print_vertex(v0);
+      print_vertex(v1);
+      print_vertex(v2);
+   }
 #endif
 
-   setup.vprovoke = v2;
-
    /* determine bottom to top order of vertices */
    {
       float y0 = spu_extract(v0->data[0], 1);
@@ -525,18 +453,21 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v0;   
 	    setup.vmid = v1;   
 	    setup.vmax = v2;
+            sign = -1.0f;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
 	    setup.vmin = v2;   
 	    setup.vmid = v0;   
 	    setup.vmax = v1;   
+            sign = -1.0f;
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
 	    setup.vmin = v0;   
 	    setup.vmid = v2;   
 	    setup.vmax = v1;  
+            sign = 1.0f;
 	 }
       }
       else {
@@ -545,18 +476,21 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 	    setup.vmin = v1;   
 	    setup.vmid = v0;   
 	    setup.vmax = v2;  
+            sign = 1.0f;
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
 	    setup.vmin = v2;   
 	    setup.vmid = v1;   
 	    setup.vmax = v0;  
+            sign = 1.0f;
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
 	    setup.vmin = v1;   
 	    setup.vmid = v2;   
 	    setup.vmax = v0;
+            sign = -1.0f;
 	 }
       }
    }
@@ -585,31 +519,23 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
    /*
     * Compute triangle's area.  Use 1/area to compute partial
     * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
     */
-   {
-      const float area = (setup.emaj.dx * setup.ebot.dy - 
-			    setup.ebot.dx * setup.emaj.dy);
-
-      setup.oneoverarea = 1.0f / area;
-      /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneoverarea, area, prim->det );
-      */
-   }
+   area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy;
 
-#if 0
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
+   setup.oneOverArea = 1.0f / area;
+
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
     */
-   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
-#endif
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
+   setup.facing = (area * sign > 0.0f)
+      ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
+
+   setup.vprovoke = v2;
 
    return TRUE;
 }
@@ -622,63 +548,11 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
  * \param slot  which attribute slot 
  */
 static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
 {
-   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
-}
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
-   uint i;
-   const float *vmin_d = (float *) &setup.vmin->data[slot];
-   const float *vmid_d = (float *) &setup.vmid->data[slot];
-   const float *vmax_d = (float *) &setup.vmax->data[slot];
-   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
-   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
-   for (i = firstComp; i < lastComp; i++) {
-      float botda = vmid_d[i] - vmin_d[i];
-      float majda = vmax_d[i] - vmin_d[i];
-      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-   
-      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
-                                 (setup.coef[slot].dadx.f[i] * x + 
-                                  setup.coef[slot].dady.f[i] * y));
-   }
-
-   /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-		slot, "xyzw"[i], 
-		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx.f[i],
-		setup.coef[slot].dady.f[i]);
-   */
+   setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0 = setup.vprovoke->data[slot];
 }
 
 
@@ -702,18 +576,16 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
 
-   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
-   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
                          
-   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
 
 
-
-#if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -722,82 +594,76 @@ tri_linear_coeff4(uint slot)
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( unsigned slot,
-                             unsigned i )
+static void
+tri_persp_coeff4(uint slot)
 {
-   /* premultiply by 1/w:
-    */
-   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
-   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
-   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-      
-   /*
-   printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup.vmin->data[slot][i],
-          setup.vmid->data[slot][i],
-          setup.vmax->data[slot][i]
-          );
-   */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
+
+   vector float vmin_d = setup.vmin->data[slot];
+   vector float vmid_d = setup.vmid->data[slot];
+   vector float vmax_d = setup.vmax->data[slot];
+
+   vmin_d = spu_mul(vmin_d, vmin_w);
+   vmid_d = spu_mul(vmid_d, vmid_w);
+   vmax_d = spu_mul(vmax_d, vmax_w);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
 
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0.f[i] = (mina - 
-			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady, yyyy);
+                         
+   setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
-#endif
+
 
 
 /**
  * Compute the setup.coef[] array dadx, dady, a0 values.
  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
 {
-#if 1
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-      switch (spu.vertex_info.interp_mode[i]) {
+      switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
-      case INTERP_POS:
-         /*tri_linear_coeff(i, 2, 3);*/
-         /* XXX interp W if PERSPECTIVE... */
-         tri_linear_coeff4(i);
-         break;
       case INTERP_CONSTANT:
-         const_coeff(i);
+         const_coeff4(i);
          break;
+      case INTERP_POS:
+         /* fall-through */
       case INTERP_LINEAR:
          tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff4(i);  /* temporary */
+         tri_persp_coeff4(i);
          break;
       default:
          ASSERT(0);
       }
    }
-#else
-   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
-   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
-          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
-#endif
 }
 
 
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
 {
    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -827,9 +693,8 @@ static void setup_tri_edges(void)
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 {
    const int minx = setup.cliprect_minx;
    const int maxx = setup.cliprect_maxx;
@@ -902,7 +767,8 @@ static void subtriangle( struct edge *eleft,
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2,
+         uint tx, uint ty)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -926,19 +792,14 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
    setup.span.right[1] = 0;
-   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
-   /*   init_constant_attribs( setup ); */
-      
-   if (setup.oneoverarea < 0.0) {
-      /* emaj on left:
-       */
+   if (setup.oneOverArea < 0.0) {
+      /* emaj on left */
       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
-      /* emaj on right:
-       */
+      /* emaj on right */
       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915simple/i915_debug.c
index 5e26d1b9057..7a4e7051d20 100644
--- a/src/gallium/drivers/i915simple/i915_debug.c
+++ b/src/gallium/drivers/i915simple/i915_debug.c
@@ -210,6 +210,7 @@ BITS(
    PRINTF(stream, ": 0x%x\n", ((dw) & himask) >> (lo));
 }
 
+#ifdef DEBUG
 #define MBZ( dw, hi, lo) do {							\
    unsigned x = (dw) >> (lo);				\
    unsigned lomask = (1 << (lo)) - 1;			\
@@ -217,6 +218,10 @@ BITS(
    himask = (1UL << (hi)) - 1;				\
    assert ((x & himask & ~lomask) == 0);	\
 } while (0)
+#else
+#define MBZ( dw, hi, lo) do {							\
+} while (0)
+#endif
 
 static void
 FLAG(
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 34b4a846c11..43d62c51765 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -144,7 +144,7 @@ src_vector(struct i915_fp_compile *p,
            const struct tgsi_full_src_register *source)
 {
    uint index = source->SrcRegister.Index;
-   uint src, sem_name, sem_ind;
+   uint src = 0, sem_name, sem_ind;
 
    switch (source->SrcRegister.File) {
    case TGSI_FILE_TEMPORARY:
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index bd87217063c..c65c0642319 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -206,11 +206,10 @@ i945_miptree_layout_2d( struct i915_texture *tex )
    unsigned nblocksx = pt->nblocksx[0];
    unsigned nblocksy = pt->nblocksy[0];
 
-#if 0 /* used for tiled display targets */
-   if (pt->last_level == 0 && pt->block.size == 4)
+   /* used for tiled display targets */
+   if (0)
       if (i915_displaytarget_layout(tex))
 	 return;
-#endif
 
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
 
diff --git a/src/gallium/drivers/nouveau/nouveau_bo.h b/src/gallium/drivers/nouveau/nouveau_bo.h
new file mode 100644
index 00000000000..0ed3367815a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_bo.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_BO_H__
+#define __NOUVEAU_BO_H__
+
+/* Relocation/Buffer type flags */
+#define NOUVEAU_BO_VRAM  (1 << 0)
+#define NOUVEAU_BO_GART  (1 << 1)
+#define NOUVEAU_BO_RD    (1 << 2)
+#define NOUVEAU_BO_WR    (1 << 3)
+#define NOUVEAU_BO_RDWR  (NOUVEAU_BO_RD | NOUVEAU_BO_WR)
+#define NOUVEAU_BO_MAP   (1 << 4)
+#define NOUVEAU_BO_PIN   (1 << 5)
+#define NOUVEAU_BO_LOW   (1 << 6)
+#define NOUVEAU_BO_HIGH  (1 << 7)
+#define NOUVEAU_BO_OR    (1 << 8)
+#define NOUVEAU_BO_LOCAL (1 << 9)
+#define NOUVEAU_BO_TILED (1 << 10)
+#define NOUVEAU_BO_ZTILE (1 << 11)
+#define NOUVEAU_BO_SWIZZLED (1 << 12)
+#define NOUVEAU_BO_DUMMY (1 << 31)
+
+struct nouveau_bo {
+	struct nouveau_device *device;
+	uint64_t handle;
+
+	uint64_t size;
+	void *map;
+
+	uint32_t flags;
+	uint64_t offset;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_channel.h b/src/gallium/drivers/nouveau/nouveau_channel.h
new file mode 100644
index 00000000000..cd99a676bdc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_channel.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_CHANNEL_H__
+#define __NOUVEAU_CHANNEL_H__
+
+struct nouveau_channel {
+	struct nouveau_device *device;
+	int id;
+
+	struct nouveau_pushbuf *pushbuf;
+
+	struct nouveau_grobj *nullobj;
+	struct nouveau_grobj *vram;
+	struct nouveau_grobj *gart;
+
+	void *user_private;
+	void (*hang_notify)(struct nouveau_channel *);
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
new file mode 100644
index 00000000000..c3d8d7539d1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -0,0 +1,6259 @@
+/*************************************************************************
+
+   Autogenerated file, do not edit !
+
+**************************************************************************
+
+   Copyright (C) 2006-2008 :
+   Dmitry Baryshkov,
+   Laurent Carlier,
+   Matthieu Castet,
+   Dawid Gajownik,
+   Jeremy Kolb,
+   Stephane Loeuillet,
+   Patrice Mandin,
+   Stephane Marchesin,
+   Serge Martin,
+   Sylvain Munaut,
+   Simon Raffeiner,
+   Ben Skeggs,
+   Erik Waling,
+   koala_br,
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*************************************************************************/
+
+
+#ifndef NOUVEAU_REG_H
+#define NOUVEAU_REG_H 1
+
+
+#define NV01_ROOT									0x00000001
+
+
+
+#define NV01_CONTEXT_DMA								0x00000002
+
+
+
+#define NV01_DEVICE									0x00000003
+
+
+
+#define NV01_TIMER									0x00000004
+
+#define  NV01_TIMER_SYNCHRONIZE								0x00000100
+#define  NV01_TIMER_STOP_ALARM								0x00000104
+#define  NV01_TIMER_DMA_NOTIFY								0x00000180
+#define  NV01_TIMER_TIME(x)								(0x00000300+((x)*4))
+#define  NV01_TIMER_TIME__SIZE								0x00000002
+#define  NV01_TIMER_ALARM_NOTIFY							0x00000308
+
+
+#define NV_IMAGE_STENCIL								0x00000010
+
+#define  NV_IMAGE_STENCIL_NOTIFY							0x00000104
+#define  NV_IMAGE_STENCIL_DMA_NOTIFY							0x00000180
+#define  NV_IMAGE_STENCIL_IMAGE_OUTPUT							0x00000200
+#define  NV_IMAGE_STENCIL_IMAGE_INPUT(x)						(0x00000204+((x)*4))
+#define  NV_IMAGE_STENCIL_IMAGE_INPUT__SIZE						0x00000002
+
+
+#define NV_IMAGE_BLEND_AND								0x00000011
+
+#define  NV_IMAGE_BLEND_AND_NOP								0x00000100
+#define  NV_IMAGE_BLEND_AND_NOTIFY							0x00000104
+#define  NV_IMAGE_BLEND_AND_DMA_NOTIFY							0x00000180
+#define  NV_IMAGE_BLEND_AND_IMAGE_OUTPUT						0x00000200
+#define  NV_IMAGE_BLEND_AND_BETA_INPUT							0x00000204
+#define  NV_IMAGE_BLEND_AND_IMAGE_INPUT							0x00000208
+
+
+#define NV01_CONTEXT_BETA1								0x00000012
+
+#define  NV01_CONTEXT_BETA1_NOP								0x00000100
+#define  NV01_CONTEXT_BETA1_NOTIFY							0x00000104
+#define  NV01_CONTEXT_BETA1_DMA_NOTIFY							0x00000180
+#define  NV01_CONTEXT_BETA1_BETA_1D31							0x00000300
+
+
+#define NV_IMAGE_ROP_AND								0x00000013
+
+#define  NV_IMAGE_ROP_AND_NOTIFY							0x00000104
+#define  NV_IMAGE_ROP_AND_DMA_NOTIFY							0x00000180
+#define  NV_IMAGE_ROP_AND_IMAGE_OUTPUT							0x00000200
+#define  NV_IMAGE_ROP_AND_ROP_INPUT							0x00000204
+#define  NV_IMAGE_ROP_AND_IMAGE_INPUT(x)						(0x00000208+((x)*4))
+#define  NV_IMAGE_ROP_AND_IMAGE_INPUT__SIZE						0x00000002
+
+
+#define NV_IMAGE_COLOR_KEY								0x00000015
+
+
+
+#define NV01_CONTEXT_COLOR_KEY								0x00000017
+
+#define  NV01_CONTEXT_COLOR_KEY_NOP							0x00000100
+#define  NV01_CONTEXT_COLOR_KEY_NOTIFY							0x00000104
+#define  NV01_CONTEXT_COLOR_KEY_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT						0x00000300
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A8Y8					0x00000001
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X24Y8					0x00000002
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A1R5G5B5				0x00000003
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X17R5G5B5					0x00000004
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A8R8G8B8					0x00000005
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X8R8G8B8					0x00000006
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A16Y16					0x00000007
+#define   NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16Y16					0x00000008
+#define  NV01_CONTEXT_COLOR_KEY_COLOR							0x00000304
+
+
+#define NV01_CONTEXT_PATTERN								0x00000018
+
+#define  NV01_CONTEXT_PATTERN_NOP							0x00000100
+#define  NV01_CONTEXT_PATTERN_NOTIFY							0x00000104
+#define  NV01_CONTEXT_PATTERN_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_PATTERN_COLOR_FORMAT						0x00000300
+#define  NV01_CONTEXT_PATTERN_MONOCHROME_FORMAT						0x00000304
+#define  NV01_CONTEXT_PATTERN_SHAPE							0x00000308
+#define  NV01_CONTEXT_PATTERN_COLOR(x)							(0x00000310+((x)*4))
+#define  NV01_CONTEXT_PATTERN_COLOR__SIZE						0x00000002
+#define  NV01_CONTEXT_PATTERN_PATTERN(x)						(0x00000318+((x)*4))
+#define  NV01_CONTEXT_PATTERN_PATTERN__SIZE						0x00000002
+
+
+#define NV01_CONTEXT_CLIP_RECTANGLE							0x00000019
+
+#define  NV01_CONTEXT_CLIP_RECTANGLE_NOP						0x00000100
+#define  NV01_CONTEXT_CLIP_RECTANGLE_NOTIFY						0x00000104
+#define  NV01_CONTEXT_CLIP_RECTANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_CONTEXT_CLIP_RECTANGLE_POINT						0x00000300
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_SHIFT					0
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_MASK					0x0000ffff
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_SHIFT					16
+#define   NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_MASK					0xffff0000
+#define  NV01_CONTEXT_CLIP_RECTANGLE_SIZE						0x00000304
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_SHIFT					0
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_MASK					0x0000ffff
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_SHIFT					16
+#define   NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_MASK					0xffff0000
+
+
+#define NV01_RENDER_SOLID_LINE								0x0000001c
+
+#define  NV01_RENDER_SOLID_LINE_NOP							0x00000100
+#define  NV01_RENDER_SOLID_LINE_NOTIFY							0x00000104
+#define  NV01_RENDER_SOLID_LINE_PATCH							0x0000010c
+#define  NV01_RENDER_SOLID_LINE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_LINE_CLIP_RECTANGLE						0x00000184
+#define  NV01_RENDER_SOLID_LINE_PATTERN							0x00000188
+#define  NV01_RENDER_SOLID_LINE_ROP							0x0000018c
+#define  NV01_RENDER_SOLID_LINE_BETA1							0x00000190
+#define  NV01_RENDER_SOLID_LINE_SURFACE							0x00000194
+#define  NV01_RENDER_SOLID_LINE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV01_RENDER_SOLID_LINE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_LINE_OPERATION_BLEND_AND					0x00000002
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_LINE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_LINE_COLOR_FORMAT						0x00000300
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A8Y8					0x00000001
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X24Y8					0x00000002
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A1R5G5B5				0x00000003
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X17R5G5B5					0x00000004
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A8R8G8B8					0x00000005
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X8R8G8B8					0x00000006
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A16Y16					0x00000007
+#define   NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16Y16					0x00000008
+#define  NV01_RENDER_SOLID_LINE_COLOR							0x00000304
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT0(x)						(0x00000400+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT0__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT1(x)						(0x00000404+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_LINE_POINT1__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_X(x)					(0x00000480+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_X__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y(x)					(0x00000484+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_X(x)					(0x00000488+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_X__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y(x)					(0x0000048c+((x)*16))
+#define  NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_POLYLINE(x)						(0x00000500+((x)*4))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE__SIZE						0x00000020
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_X_SHIFT					0
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_LINE_POLYLINE_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X(x)					(0x00000580+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y(x)					(0x00000584+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR(x)					(0x00000600+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR__SIZE					0x00000010
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT(x)					(0x00000604+((x)*8))
+#define  NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT__SIZE					0x00000010
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_MASK					0xffff0000
+
+
+#define NV01_RENDER_SOLID_TRIANGLE							0x0000001d
+
+#define  NV01_RENDER_SOLID_TRIANGLE_NOP							0x00000100
+#define  NV01_RENDER_SOLID_TRIANGLE_NOTIFY						0x00000104
+#define  NV01_RENDER_SOLID_TRIANGLE_PATCH						0x0000010c
+#define  NV01_RENDER_SOLID_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_TRIANGLE_CLIP_RECTANGLE					0x00000184
+#define  NV01_RENDER_SOLID_TRIANGLE_PATTERN						0x00000188
+#define  NV01_RENDER_SOLID_TRIANGLE_ROP							0x0000018c
+#define  NV01_RENDER_SOLID_TRIANGLE_BETA1						0x00000190
+#define  NV01_RENDER_SOLID_TRIANGLE_SURFACE						0x00000194
+#define  NV01_RENDER_SOLID_TRIANGLE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_AND				0x00000002
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_TRIANGLE_COLOR_FORMAT					0x00000300
+#define  NV01_RENDER_SOLID_TRIANGLE_COLOR						0x00000304
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0					0x00000310
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1					0x00000314
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2					0x00000318
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_X					0x00000320
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_Y					0x00000324
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_X					0x00000328
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_Y					0x0000032c
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_X					0x00000330
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_Y					0x00000334
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH(x)						(0x00000400+((x)*4))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH__SIZE					0x00000020
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_SHIFT					0
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_MASK					0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_SHIFT					16
+#define   NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_MASK					0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X(x)				(0x00000480+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y(x)				(0x00000484+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR(x)					(0x00000500+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR__SIZE				0x00000008
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0(x)					(0x00000504+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1(x)					(0x00000508+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2(x)					(0x0000050c+((x)*16))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2__SIZE				0x00000008
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR(x)					(0x00000580+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR__SIZE				0x00000010
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT(x)					(0x00000584+((x)*8))
+#define  NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_MASK				0xffff0000
+
+
+#define NV01_RENDER_SOLID_RECTANGLE							0x0000001e
+
+#define  NV01_RENDER_SOLID_RECTANGLE_NOP						0x00000100
+#define  NV01_RENDER_SOLID_RECTANGLE_NOTIFY						0x00000104
+#define  NV01_RENDER_SOLID_RECTANGLE_PATCH						0x0000010c
+#define  NV01_RENDER_SOLID_RECTANGLE_DMA_NOTIFY						0x00000180
+#define  NV01_RENDER_SOLID_RECTANGLE_CLIP_RECTANGLE					0x00000184
+#define  NV01_RENDER_SOLID_RECTANGLE_PATTERN						0x00000188
+#define  NV01_RENDER_SOLID_RECTANGLE_ROP						0x0000018c
+#define  NV01_RENDER_SOLID_RECTANGLE_BETA1						0x00000190
+#define  NV01_RENDER_SOLID_RECTANGLE_SURFACE						0x00000194
+#define  NV01_RENDER_SOLID_RECTANGLE_OPERATION						0x000002fc
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_ROP_AND					0x00000001
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_AND				0x00000002
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY					0x00000003
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV01_RENDER_SOLID_RECTANGLE_COLOR_FORMAT					0x00000300
+#define  NV01_RENDER_SOLID_RECTANGLE_COLOR						0x00000304
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT(x)					(0x00000400+((x)*8))
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_SHIFT				0
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_SHIFT				16
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_MASK				0xffff0000
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE(x)					(0x00000404+((x)*8))
+#define  NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE__SIZE				0x00000010
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_SHIFT				0
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_MASK				0x0000ffff
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_SHIFT				16
+#define   NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_MASK				0xffff0000
+
+
+#define NV01_IMAGE_BLIT									0x0000001f
+
+#define  NV01_IMAGE_BLIT_NOP								0x00000100
+#define  NV01_IMAGE_BLIT_NOTIFY								0x00000104
+#define  NV01_IMAGE_BLIT_PATCH								0x0000010c
+#define  NV01_IMAGE_BLIT_DMA_NOTIFY							0x00000180
+#define  NV01_IMAGE_BLIT_COLOR_KEY							0x00000184
+#define  NV01_IMAGE_BLIT_CLIP_RECTANGLE							0x00000188
+#define  NV01_IMAGE_BLIT_PATTERN							0x0000018c
+#define  NV01_IMAGE_BLIT_ROP								0x00000190
+#define  NV01_IMAGE_BLIT_BETA1								0x00000194
+#define  NV01_IMAGE_BLIT_SURFACE							0x0000019c
+#define  NV01_IMAGE_BLIT_OPERATION							0x000002fc
+#define  NV01_IMAGE_BLIT_IMAGE_INPUT							0x00000204
+#define  NV01_IMAGE_BLIT_POINT_IN							0x00000300
+#define   NV01_IMAGE_BLIT_POINT_IN_X_SHIFT						0
+#define   NV01_IMAGE_BLIT_POINT_IN_X_MASK						0x0000ffff
+#define   NV01_IMAGE_BLIT_POINT_IN_Y_SHIFT						16
+#define   NV01_IMAGE_BLIT_POINT_IN_Y_MASK						0xffff0000
+#define  NV01_IMAGE_BLIT_POINT_OUT							0x00000304
+#define   NV01_IMAGE_BLIT_POINT_OUT_X_SHIFT						0
+#define   NV01_IMAGE_BLIT_POINT_OUT_X_MASK						0x0000ffff
+#define   NV01_IMAGE_BLIT_POINT_OUT_Y_SHIFT						16
+#define   NV01_IMAGE_BLIT_POINT_OUT_Y_MASK						0xffff0000
+#define  NV01_IMAGE_BLIT_SIZE								0x00000308
+#define   NV01_IMAGE_BLIT_SIZE_W_SHIFT							0
+#define   NV01_IMAGE_BLIT_SIZE_W_MASK							0x0000ffff
+#define   NV01_IMAGE_BLIT_SIZE_H_SHIFT							16
+#define   NV01_IMAGE_BLIT_SIZE_H_MASK							0xffff0000
+
+
+#define NV01_IMAGE_FROM_CPU								0x00000021
+
+#define  NV01_IMAGE_FROM_CPU_NOP							0x00000100
+#define  NV01_IMAGE_FROM_CPU_NOTIFY							0x00000104
+#define  NV01_IMAGE_FROM_CPU_PATCH							0x0000010c
+#define  NV01_IMAGE_FROM_CPU_DMA_NOTIFY							0x00000180
+#define  NV01_IMAGE_FROM_CPU_COLOR_KEY							0x00000184
+#define  NV01_IMAGE_FROM_CPU_CLIP_RECTANGLE						0x00000188
+#define  NV01_IMAGE_FROM_CPU_PATTERN							0x0000018c
+#define  NV01_IMAGE_FROM_CPU_ROP							0x00000190
+#define  NV01_IMAGE_FROM_CPU_BETA1							0x00000194
+#define  NV01_IMAGE_FROM_CPU_SURFACE							0x00000198
+#define  NV01_IMAGE_FROM_CPU_OPERATION							0x000002fc
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV01_IMAGE_FROM_CPU_OPERATION_ROP_AND						0x00000001
+#define   NV01_IMAGE_FROM_CPU_OPERATION_BLEND_AND					0x00000002
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY						0x00000003
+#define   NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_PREMULT					0x00000004
+#define   NV01_IMAGE_FROM_CPU_OPERATION_BLEND_PREMULT					0x00000005
+#define  NV01_IMAGE_FROM_CPU_COLOR_FORMAT						0x00000300
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_Y8						0x00000001
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A1R5G5B5					0x00000002
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X1R5G5B5					0x00000003
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A8R8G8B8					0x00000004
+#define   NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X8R8G8B8					0x00000005
+#define  NV01_IMAGE_FROM_CPU_POINT							0x00000304
+#define   NV01_IMAGE_FROM_CPU_POINT_X_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_POINT_X_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_POINT_Y_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_POINT_Y_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_SIZE_OUT							0x00000308
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_W_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_W_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_H_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_SIZE_OUT_H_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_SIZE_IN							0x0000030c
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT						0
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_W_MASK						0x0000ffff
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT						16
+#define   NV01_IMAGE_FROM_CPU_SIZE_IN_H_MASK						0xffff0000
+#define  NV01_IMAGE_FROM_CPU_COLOR(x)							(0x00000400+((x)*4))
+#define  NV01_IMAGE_FROM_CPU_COLOR__SIZE						0x00000020
+
+
+#define NV01_NULL									0x00000030
+
+
+
+#define NV03_STRETCHED_IMAGE_FROM_CPU							0x00000036
+
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_NOP						0x00000100
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_NOTIFY						0x00000104
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_PATCH						0x0000010c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DMA_NOTIFY					0x00000180
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_KEY					0x00000184
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_PATTERN						0x00000188
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_ROP						0x0000018c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_BETA1						0x00000190
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_SURFACE						0x00000194
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_OPERATION					0x000002fc
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_FORMAT					0x00000300
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN						0x00000304
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT					0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_MASK					0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT					16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_MASK					0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DX_DU						0x00000308
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_DY_DV						0x0000030c
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT					0x00000310
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE					0x00000314
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4					0x00000318
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_SHIFT				0
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_MASK				0x0000ffff
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_SHIFT				16
+#define   NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_MASK				0xffff0000
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR(x)						(0x00000400+((x)*4))
+#define  NV03_STRETCHED_IMAGE_FROM_CPU_COLOR__SIZE					0x00000020
+
+
+#define NV03_SCALED_IMAGE_FROM_MEMORY							0x00000037
+
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_NOP						0x00000100
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_NOTIFY						0x00000104
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY					0x00000180
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE					0x00000184
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_PATTERN						0x00000188
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_ROP						0x0000018c
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_BETA1						0x00000190
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_SURFACE						0x00000194
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT					0x00000300
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5				0x00000001
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5				0x00000002
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8				0x00000003
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8				0x00000004
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_V8YB8U8YA8				0x00000005
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_YB8V8YA8U8				0x00000006
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5				0x00000007
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8					0x00000008
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_AY8				0x00000009
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION					0x00000304
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_ROP_AND				0x00000001
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_AND				0x00000002
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY				0x00000003
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_PREMULT			0x00000004
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT					0x00000308
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE					0x0000030c
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT					0x00000310
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_X_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_X_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_Y_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_Y_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE					0x00000314
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_W_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_W_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_H_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_H_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DELTA_DU_DX					0x00000318
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_DELTA_DV_DY					0x0000031c
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE					0x00000400
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_W_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_W_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_H_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_H_MASK				0xffff0000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT					0x00000404
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_PITCH_SHIFT			0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_PITCH_MASK			0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_SHIFT			16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_MASK			0x00ff0000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_CENTER			0x00010000
+#define    NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_CORNER			0x00020000
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_INTERPOLATOR_SHIFT		24
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_INTERPOLATOR_MASK		0xff000000
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_OFFSET					0x00000408
+#define  NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT					0x0000040c
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_U_SHIFT				0
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_U_MASK				0x0000ffff
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_V_SHIFT				16
+#define   NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_V_MASK				0xffff0000
+
+
+#define NV04_DVD_SUBPICTURE								0x00000038
+
+#define  NV04_DVD_SUBPICTURE_NOP							0x00000100
+#define  NV04_DVD_SUBPICTURE_NOTIFY							0x00000104
+#define  NV04_DVD_SUBPICTURE_WAIT_FOR_IDLE						0x00000108
+#define  NV04_DVD_SUBPICTURE_DMA_NOTIFY							0x00000180
+#define  NV04_DVD_SUBPICTURE_DMA_OVERLAY						0x00000184
+#define  NV04_DVD_SUBPICTURE_DMA_IMAGEIN						0x00000188
+#define  NV04_DVD_SUBPICTURE_DMA_IMAGEOUT						0x0000018c
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_POINT						0x00000300
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE						0x00000304
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT						0x00000308
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_MASK				0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEOUT_OFFSET						0x0000030c
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DU_DX					0x00000310
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DV_DY					0x00000314
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_SIZE						0x00000318
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT						0x0000031c
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_OFFSET						0x00000320
+#define  NV04_DVD_SUBPICTURE_IMAGEIN_POINT						0x00000324
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DU_DX					0x00000328
+#define  NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DV_DY					0x0000032c
+#define  NV04_DVD_SUBPICTURE_OVERLAY_SIZE						0x00000330
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_FORMAT						0x00000334
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_SHIFT				0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_SHIFT				16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_MASK					0xffff0000
+#define  NV04_DVD_SUBPICTURE_OVERLAY_OFFSET						0x00000338
+#define  NV04_DVD_SUBPICTURE_OVERLAY_POINT						0x0000033c
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_SHIFT					0
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_MASK					0x0000ffff
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_SHIFT					16
+#define   NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_MASK					0xffff0000
+
+
+#define NV04_MEMORY_TO_MEMORY_FORMAT							0x00000039
+
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_NOP						0x00000100
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_NOTIFY						0x00000104
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY					0x00000180
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN					0x00000184
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_OUT					0x00000188
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN						0x0000030c
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT					0x00000310
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN						0x00000314
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT						0x00000318
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN					0x0000031c
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_LINE_COUNT					0x00000320
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT						0x00000324
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_SHIFT				0
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_MASK				0x0000000f
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_SHIFT				8
+#define   NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_MASK				0x00000f00
+#define  NV04_MEMORY_TO_MEMORY_FORMAT_BUF_NOTIFY					0x00000328
+
+
+#define NV01_MEMORY_LOCAL_BANKED							0x0000003d
+
+
+
+#define NV01_MAPPING_SYSTEM								0x0000003e
+
+
+
+#define NV03_MEMORY_LOCAL_CURSOR							0x0000003f
+
+
+
+#define NV01_MEMORY_LOCAL_LINEAR							0x00000040
+
+
+
+#define NV01_MAPPING_LOCAL								0x00000041
+
+
+
+#define NV04_CONTEXT_SURFACES_2D							0x00000042
+
+#define  NV04_CONTEXT_SURFACES_2D_NOP							0x00000100
+#define  NV04_CONTEXT_SURFACES_2D_NOTIFY						0x00000104
+#define  NV04_CONTEXT_SURFACES_2D_PM_TRIGGER						0x00000140
+#define  NV04_CONTEXT_SURFACES_2D_DMA_NOTIFY						0x00000180
+#define  NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE					0x00000184
+#define  NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_DESTIN					0x00000188
+#define  NV04_CONTEXT_SURFACES_2D_FORMAT						0x00000300
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y8						0x00000001
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_Z1R5G5B5				0x00000002
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_X1R5G5B5				0x00000003
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5					0x00000004
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y16						0x00000005
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_Z8R8G8B8				0x00000006
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_X8R8G8B8				0x00000007
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_Z1A7R8G8B8				0x00000008
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_X1A7R8G8B8				0x00000009
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8					0x0000000a
+#define   NV04_CONTEXT_SURFACES_2D_FORMAT_Y32						0x0000000b
+#define  NV04_CONTEXT_SURFACES_2D_PITCH							0x00000304
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_2D_OFFSET_SOURCE						0x00000308
+#define  NV04_CONTEXT_SURFACES_2D_OFFSET_DESTIN						0x0000030c
+
+
+#define NV03_CONTEXT_ROP								0x00000043
+
+#define  NV03_CONTEXT_ROP_NOP								0x00000100
+#define  NV03_CONTEXT_ROP_NOTIFY							0x00000104
+#define  NV03_CONTEXT_ROP_DMA_NOTIFY							0x00000180
+#define  NV03_CONTEXT_ROP_ROP								0x00000300
+#define   NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SHIFT					0
+#define   NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_MASK					0x0000000f
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_CLEAR					0x00000000
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOR					0x00000001
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_INVERTED				0x00000002
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY_INVERTED				0x00000003
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_REVERSE				0x00000004
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_INVERT					0x00000005
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_XOR					0x00000006
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NAND					0x00000007
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND					0x00000008
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_EQUI					0x00000009
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOOP					0x0000000a
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_INVERTED				0x0000000b
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY					0x0000000c
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_REVERSE					0x0000000d
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR						0x0000000e
+#define    NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SET					0x0000000f
+#define   NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SHIFT					4
+#define   NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_MASK					0x000000f0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_CLEAR					0x00000000
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOR					0x00000010
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_INVERTED				0x00000020
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY_INVERTED				0x00000030
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_REVERSE				0x00000040
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_INVERT					0x00000050
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_XOR					0x00000060
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NAND					0x00000070
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND					0x00000080
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_EQUI					0x00000090
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOOP					0x000000a0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_INVERTED				0x000000b0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY					0x000000c0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_REVERSE					0x000000d0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR						0x000000e0
+#define    NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SET					0x000000f0
+
+
+#define NV04_IMAGE_PATTERN								0x00000044
+
+#define  NV04_IMAGE_PATTERN_NOP								0x00000100
+#define  NV04_IMAGE_PATTERN_NOTIFY							0x00000104
+#define  NV04_IMAGE_PATTERN_DMA_NOTIFY							0x00000180
+#define  NV04_IMAGE_PATTERN_COLOR_FORMAT						0x00000300
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_A16R5G6B5					0x00000001
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_X16A1R5G5B5					0x00000002
+#define   NV04_IMAGE_PATTERN_COLOR_FORMAT_A8R8G8B8					0x00000003
+#define  NV04_IMAGE_PATTERN_MONOCHROME_FORMAT						0x00000304
+#define   NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_CGA6					0x00000001
+#define   NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_LE					0x00000002
+#define  NV04_IMAGE_PATTERN_MONOCHROME_SHAPE						0x00000308
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_8X8					0x00000000
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_64X1					0x00000001
+#define   NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_1X64					0x00000002
+#define  NV04_IMAGE_PATTERN_PATTERN_SELECT						0x0000030c
+#define   NV04_IMAGE_PATTERN_PATTERN_SELECT_MONO					0x00000001
+#define   NV04_IMAGE_PATTERN_PATTERN_SELECT_COLOR					0x00000002
+#define  NV04_IMAGE_PATTERN_MONOCHROME_COLOR0						0x00000310
+#define  NV04_IMAGE_PATTERN_MONOCHROME_COLOR1						0x00000314
+#define  NV04_IMAGE_PATTERN_MONOCHROME_PATTERN0						0x00000318
+#define  NV04_IMAGE_PATTERN_MONOCHROME_PATTERN1						0x0000031c
+#define  NV04_IMAGE_PATTERN_PATTERN_Y8(x)						(0x00000400+((x)*4))
+#define  NV04_IMAGE_PATTERN_PATTERN_Y8__SIZE						0x00000010
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_MASK						0x000000ff
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_SHIFT					8
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_MASK						0x0000ff00
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_MASK						0x00ff0000
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_SHIFT					24
+#define   NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_MASK						0xff000000
+#define  NV04_IMAGE_PATTERN_PATTERN_R5G6B5(x)						(0x00000500+((x)*4))
+#define  NV04_IMAGE_PATTERN_PATTERN_R5G6B5__SIZE					0x00000020
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_MASK					0x0000001f
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_SHIFT					5
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_MASK					0x000007e0
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_SHIFT					11
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_MASK					0x0000f800
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_MASK					0x001f0000
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_SHIFT					21
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_MASK					0x07e00000
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_SHIFT					27
+#define   NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_MASK					0xf8000000
+#define  NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5(x)						(0x00000600+((x)*4))
+#define  NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5__SIZE					0x00000020
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_MASK					0x0000001f
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_SHIFT					5
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_MASK					0x000003e0
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_SHIFT					10
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_MASK					0x00007c00
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_MASK					0x001f0000
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_SHIFT					21
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_MASK					0x03e00000
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_SHIFT					26
+#define   NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_MASK					0x7c000000
+#define  NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8(x)						(0x00000700+((x)*4))
+#define  NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8__SIZE					0x00000040
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_SHIFT					0
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_MASK					0x000000ff
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_SHIFT					8
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_MASK					0x0000ff00
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_SHIFT					16
+#define   NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_MASK					0x00ff0000
+
+
+#define NV03_VIDEO_LUT_CURSOR_DAC							0x00000046
+
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SYNCHRONIZE						0x00000100
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_IMAGE						0x00000104
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_CURSOR						0x00000108
+#define  NV03_VIDEO_LUT_CURSOR_DAC_STOP_DAC						0x0000010c
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_NOTIFY						0x00000180
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE(x)						(0x00000184+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT(x)						(0x0000018c+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR(x)					(0x00000194+((x)*4))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR__SIZE					0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_GET							0x000002fc
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET(x)					(0x00000300+((x)*8))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT(x)					(0x00000304+((x)*8))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_MASK			0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET(x)					(0x00000340+((x)*12))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT(x)				(0x00000344+((x)*12))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_MASK				0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT(x)					(0x00000348+((x)*12))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT__SIZE				0x00000002
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A				0x00000358
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_MASK			0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_MASK			0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE(x)				(0x00000380+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_MASK				0xffff0000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC(x)					(0x00000384+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC__SIZE					0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_MASK				0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC(x)					(0x00000388+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC__SIZE					0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_SHIFT				0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_MASK				0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_SHIFT				16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_MASK				0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_MASK				0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE(x)				(0x0000038c+((x)*16))
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE__SIZE				0x00000002
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_SHIFT			0
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_MASK			0x0000ffff
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_SHIFT			16
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_MASK			0x0fff0000
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_SHIFT			28
+#define   NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_MASK			0xf0000000
+#define  NV03_VIDEO_LUT_CURSOR_DAC_SET_PIXEL_CLOCK					0x000003a0
+
+
+#define NV03_DX3_TEXTURED_TRIANGLE							0x00000048
+
+#define  NV03_DX3_TEXTURED_TRIANGLE_NOP							0x00000100
+#define  NV03_DX3_TEXTURED_TRIANGLE_NOTIFY						0x00000104
+#define  NV03_DX3_TEXTURED_TRIANGLE_PATCH						0x0000010c
+#define  NV03_DX3_TEXTURED_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV03_DX3_TEXTURED_TRIANGLE_DMA_TEXTURE						0x00000184
+#define  NV03_DX3_TEXTURED_TRIANGLE_CLIP_RECTANGLE					0x00000188
+#define  NV03_DX3_TEXTURED_TRIANGLE_SURFACE						0x0000018c
+#define  NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_OFFSET					0x00000304
+#define  NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT					0x00000308
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_SHIFT		0
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_MASK			0x0000ffff
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_SHIFT		16
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_MASK		0x000f0000
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_SHIFT				20
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_MASK				0x00f00000
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_SHIFT			24
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_MASK			0x0f000000
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_SHIFT			28
+#define   NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_MASK			0xf0000000
+#define  NV03_DX3_TEXTURED_TRIANGLE_FILTER						0x0000030c
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_X_SHIFT				0
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_X_MASK				0x0000001f
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_SHIFT				8
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_MASK				0x00001f00
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_SHIFT				16
+#define   NV03_DX3_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_MASK				0x00ff0000
+#define  NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR						0x00000310
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_B_SHIFT					0
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_B_MASK					0x000000ff
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_G_SHIFT					8
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_G_MASK					0x0000ff00
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_R_SHIFT					16
+#define   NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_R_MASK					0x00ff0000
+#define  NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT						0x00000314
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_SHIFT			0
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_MASK			0x0000000f
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_SHIFT				4
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_MASK				0x00000030
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_SHIFT				6
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_MASK				0x000000c0
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_SHIFT			8
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_MASK			0x00000f00
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_SHIFT				12
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_MASK				0x00007000
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_PERSPECTIVE_ENABLE			(1 << 15)
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_SHIFT				16
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_MASK				0x000f0000
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_SHIFT			20
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_MASK			0x00f00000
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_SHIFT		24
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_MASK		0x07000000
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_SHIFT				27
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_MASK				0x18000000
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_BETA					(1 << 29)
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_DST_BLEND				(1 << 30)
+#define   NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SRC_BLEND				(1 << 31)
+#define  NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL					0x00000318
+#define   NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_SHIFT			0
+#define   NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_MASK			0x000000ff
+#define   NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_SHIFT			8
+#define   NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_MASK			0xffffff00
+#define  NV03_DX3_TEXTURED_TRIANGLE_SPECULAR(x)						(0x00001000+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_SPECULAR__SIZE					0x00000040
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I0_SHIFT					0
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I0_MASK					0x0000000f
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I1_SHIFT					4
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I1_MASK					0x000000f0
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I2_SHIFT					8
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I2_MASK					0x00000f00
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I3_SHIFT					12
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I3_MASK					0x0000f000
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I4_SHIFT					16
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I4_MASK					0x000f0000
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I5_SHIFT					20
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I5_MASK					0x00f00000
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_FOG_SHIFT					24
+#define   NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_FOG_MASK					0xff000000
+#define  NV03_DX3_TEXTURED_TRIANGLE_COLOR(x)						(0x00001004+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_COLOR__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_X(x)						(0x00001008+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_X__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_Y(x)						(0x0000100c+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_Y__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_Z(x)						(0x00001010+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_Z__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_M(x)						(0x00001014+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_M__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_U(x)						(0x00001018+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_U__SIZE						0x00000040
+#define  NV03_DX3_TEXTURED_TRIANGLE_V(x)						(0x0000101c+((x)*32))
+#define  NV03_DX3_TEXTURED_TRIANGLE_V__SIZE						0x00000040
+
+
+#define NV04_GDI_RECTANGLE_TEXT								0x0000004a
+
+#define  NV04_GDI_RECTANGLE_TEXT_NOP							0x00000100
+#define  NV04_GDI_RECTANGLE_TEXT_NOTIFY							0x00000104
+#define  NV04_GDI_RECTANGLE_TEXT_PATCH							0x0000010c
+#define  NV04_GDI_RECTANGLE_TEXT_PM_TRIGGER						0x00000140
+#define  NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY						0x00000180
+#define  NV04_GDI_RECTANGLE_TEXT_DMA_FONTS						0x00000184
+#define  NV04_GDI_RECTANGLE_TEXT_PATTERN						0x00000188
+#define  NV04_GDI_RECTANGLE_TEXT_ROP							0x0000018c
+#define  NV04_GDI_RECTANGLE_TEXT_BETA1							0x00000190
+#define  NV04_GDI_RECTANGLE_TEXT_BETA4							0x00000194
+#define  NV04_GDI_RECTANGLE_TEXT_SURFACE						0x00000198
+#define  NV04_GDI_RECTANGLE_TEXT_OPERATION						0x000002fc
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_AND					0x00000000
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_ROP_AND					0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_AND					0x00000002
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY					0x00000003
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_PREMULT				0x00000004
+#define   NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT						0x00000300
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5				0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_X16A1R5G5B5				0x00000002
+#define   NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8					0x00000003
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT					0x00000304
+#define   NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_CGA6				0x00000001
+#define   NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE					0x00000002
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_A						0x000003fc
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(x)				(0x00000400+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE(x)				(0x00000404+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE__SIZE				0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0						0x000005f4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1						0x000005f8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_B						0x000005fc
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0(x)				(0x00000600+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1(x)				(0x00000604+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1__SIZE			0x00000020
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0						0x000007ec
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1						0x000007f0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_C						0x000007f4
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_C							0x000007f8
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK						0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK						0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_POINT_C						0x000007fc
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_X_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x)					(0x00000800+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE				0x00000080
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0						0x00000be4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1						0x00000be8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR0_E						0x00000bec
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_E						0x00000bf0
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E						0x00000bf4
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E						0x00000bf8
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_POINT_E						0x00000bfc
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_X_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x)				(0x00000c00+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE				0x00000080
+#define  NV04_GDI_RECTANGLE_TEXT_FONT_F							0x00000ff0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_MASK					0x0fffffff
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_SHIFT					28
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_MASK					0xf0000000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0						0x00000ff4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1						0x00000ff8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_F						0x00000ffc
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F(x)					(0x00001000+((x)*4))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F__SIZE				0x00000100
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_MASK				0x000000ff
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_SHIFT				8
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_MASK				0x000fff00
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_SHIFT				20
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_MASK				0xfff00000
+#define  NV04_GDI_RECTANGLE_TEXT_FONT_G							0x000017f0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_MASK					0x0fffffff
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_SHIFT					28
+#define   NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_MASK					0xf0000000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0						0x000017f4
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1						0x000017f8
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_SHIFT					0
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_MASK					0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_SHIFT					16
+#define   NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_MASK					0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_COLOR1_G						0x000017fc
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT(x)				(0x00001800+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT__SIZE				0x00000100
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_SHIFT			0
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_MASK			0x0000ffff
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_SHIFT			16
+#define   NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_MASK			0xffff0000
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX(x)				(0x00001804+((x)*8))
+#define  NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX__SIZE				0x00000100
+
+
+#define NV03_GDI_RECTANGLE_TEXT								0x0000004b
+
+#define  NV03_GDI_RECTANGLE_TEXT_NOP							0x00000100
+#define  NV03_GDI_RECTANGLE_TEXT_NOTIFY							0x00000104
+#define  NV03_GDI_RECTANGLE_TEXT_DMA_NOTIFY						0x00000180
+#define  NV03_GDI_RECTANGLE_TEXT_PATTERN						0x00000184
+#define  NV03_GDI_RECTANGLE_TEXT_ROP							0x00000188
+#define  NV03_GDI_RECTANGLE_TEXT_BETA1							0x0000018c
+#define  NV03_GDI_RECTANGLE_TEXT_SURFACE						0x00000190
+#define  NV03_GDI_RECTANGLE_TEXT_OPERATION						0x000002fc
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR_FORMAT						0x00000300
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT					0x00000304
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_A						0x000003fc
+#define  NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT				0x00000400
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE				0x00000404
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B						0x000007f4
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B						0x000007f8
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_B						0x000007fc
+#define  NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0				0x00000800
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1				0x00000804
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT			0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK			0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT			16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK			0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0						0x00000bec
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1						0x00000bf0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_C						0x00000bf4
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_C							0x00000bf8
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK						0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK						0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_C						0x00000bfc
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x)					(0x00000c00+((x)*4))
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE				0x00000020
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0						0x00000fe8
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1						0x00000fec
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_D						0x00000ff0
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D						0x00000ff4
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D						0x00000ff8
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_D						0x00000ffc
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D(x)					(0x00001000+((x)*4))
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D__SIZE				0x00000020
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0						0x000013e4
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1						0x000013e8
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR0_E						0x000013ec
+#define  NV03_GDI_RECTANGLE_TEXT_COLOR1_E						0x000013f0
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E						0x000013f4
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E						0x000013f8
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_POINT_E						0x000013fc
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT					0
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_X_MASK					0x0000ffff
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT					16
+#define   NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK					0xffff0000
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x)				(0x00001400+((x)*4))
+#define  NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE				0x00000020
+
+
+#define NV04_SWIZZLED_SURFACE								0x00000052
+
+#define  NV04_SWIZZLED_SURFACE_NOP							0x00000100
+#define  NV04_SWIZZLED_SURFACE_NOTIFY							0x00000104
+#define  NV04_SWIZZLED_SURFACE_DMA_NOTIFY						0x00000180
+#define  NV04_SWIZZLED_SURFACE_DMA_IMAGE						0x00000184
+#define  NV04_SWIZZLED_SURFACE_FORMAT							0x00000300
+#define   NV04_SWIZZLED_SURFACE_FORMAT_COLOR_SHIFT					0
+#define   NV04_SWIZZLED_SURFACE_FORMAT_COLOR_MASK					0x000000ff
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y8					0x00000001
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5				0x00000002
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_X1R5G5B5				0x00000003
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_R5G6B5					0x00000004
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y16					0x00000005
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8				0x00000006
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_X8R8G8B8				0x00000007
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8			0x00000008
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8			0x00000009
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_A8R8G8B8					0x0000000a
+#define    NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y32					0x0000000b
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_MASK					0x00ff0000
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT				24
+#define   NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_MASK					0xff000000
+#define  NV04_SWIZZLED_SURFACE_OFFSET							0x00000304
+
+
+#define NV04_CONTEXT_SURFACES_3D							0x00000053
+
+#define  NV04_CONTEXT_SURFACES_3D_NOP							0x00000100
+#define  NV04_CONTEXT_SURFACES_3D_NOTIFY						0x00000104
+#define  NV04_CONTEXT_SURFACES_3D_DMA_NOTIFY						0x00000180
+#define  NV04_CONTEXT_SURFACES_3D_DMA_COLOR						0x00000184
+#define  NV04_CONTEXT_SURFACES_3D_DMA_ZETA						0x00000188
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL					0x000002f8
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_SHIFT				0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_MASK				0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_MASK				0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL						0x000002fc
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_SHIFT				0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_FORMAT						0x00000300
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_MASK					0x000000ff
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5			0x00000001
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_X1R5G5B5			0x00000002
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_R5G6B5					0x00000003
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8			0x00000004
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_X8R8G8B8			0x00000005
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8			0x00000006
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8			0x00000007
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_A8R8G8B8				0x00000008
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SHIFT					8
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_MASK					0x0000ff00
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_PITCH					0x00000100
+#define    NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SWIZZLE					0x00000200
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_MASK				0x00ff0000
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_SHIFT				24
+#define   NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_MASK				0xff000000
+#define  NV04_CONTEXT_SURFACES_3D_CLIP_SIZE						0x00000304
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_PITCH							0x00000308
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_SHIFT					0
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_MASK					0x0000ffff
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_SHIFT					16
+#define   NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_MASK					0xffff0000
+#define  NV04_CONTEXT_SURFACES_3D_OFFSET_COLOR						0x0000030c
+#define  NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA						0x00000310
+
+
+#define NV04_DX5_TEXTURED_TRIANGLE							0x00000054
+
+#define  NV04_DX5_TEXTURED_TRIANGLE_NOP							0x00000100
+#define  NV04_DX5_TEXTURED_TRIANGLE_NOTIFY						0x00000104
+#define  NV04_DX5_TEXTURED_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV04_DX5_TEXTURED_TRIANGLE_DMA_A						0x00000184
+#define  NV04_DX5_TEXTURED_TRIANGLE_DMA_B						0x00000188
+#define  NV04_DX5_TEXTURED_TRIANGLE_SURFACE						0x0000018c
+#define  NV04_DX5_TEXTURED_TRIANGLE_COLORKEY						0x00000300
+#define  NV04_DX5_TEXTURED_TRIANGLE_OFFSET						0x00000304
+#define  NV04_DX5_TEXTURED_TRIANGLE_FORMAT						0x00000308
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_DMA_SHIFT					0
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_DMA_MASK					0x00000003
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_SHIFT			2
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_MASK			0x0000000c
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT				4
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK				0x00000030
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CENTER				0x00000010
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER				0x00000020
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT				6
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_MASK				0x000000c0
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CENTER				0x00000040
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER				0x00000080
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_SHIFT					8
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_MASK					0x00000f00
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_Y8					0x00000100
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A1R5G5B5				0x00000200
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_X1R5G5B5				0x00000300
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A4R4G4B4				0x00000400
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_R5G6B5				0x00000500
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A8R8G8B8				0x00000600
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_X8R8G8B8				0x00000700
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT				12
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK				0x0000f000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_MASK				0x000f0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT				20
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_MASK				0x00f00000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT				24
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MASK				0x07000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT				0x01000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT			0x02000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE			0x03000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER			0x04000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP				0x05000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_WRAPU					(1 << 27)
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT				28
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MASK				0x70000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_REPEAT				0x10000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MIRRORED_REPEAT			0x20000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE			0x30000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_BORDER			0x40000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP				0x50000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FORMAT_WRAPV					(1 << 31)
+#define  NV04_DX5_TEXTURED_TRIANGLE_FILTER						0x0000030c
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT				0
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK				0x000000ff
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT				8
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK				0x00007f00
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE			(1 << 15)
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT			16
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK				0x00ff0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_SHIFT				24
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_MASK					0x07000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST				0x01000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR				0x02000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST		0x03000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST		0x04000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR		0x05000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR		0x06000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE			(1 << 27)
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_SHIFT				28
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_MASK				0x70000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST				0x10000000
+#define    NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR				0x20000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE			(1 << 31)
+#define  NV04_DX5_TEXTURED_TRIANGLE_BLEND						0x00000310
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_SHIFT				0
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_MASK				0x0000000f
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_MASK_BIT_SHIFT				4
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_MASK_BIT_MASK				0x00000030
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_SHIFT				6
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_MASK				0x000000c0
+#define    NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT				0x00000040
+#define    NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD				0x00000080
+#define    NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_PHONG				0x000000c0
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_SHIFT		8
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_MASK		0x00000f00
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SPECULAR_ENABLE_SHIFT			12
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SPECULAR_ENABLE_MASK				0x0000f000
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_FOG_ENABLE_SHIFT				16
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_FOG_ENABLE_MASK				0x000f0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_ALPHA_ENABLE_SHIFT				20
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_ALPHA_ENABLE_MASK				0x00f00000
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SRC_SHIFT					24
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_SRC_MASK					0x0f000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_DST_SHIFT					28
+#define   NV04_DX5_TEXTURED_TRIANGLE_BLEND_DST_MASK					0xf0000000
+#define  NV04_DX5_TEXTURED_TRIANGLE_CONTROL						0x00000314
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_SHIFT				0
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_MASK				0x000000ff
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT				8
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_MASK				0x00000f00
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE				(1 << 12)
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN					(1 << 13)
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT				14
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_MASK				0x0000c000
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT				16
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_MASK				0x000f0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT				20
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_MASK				0x00300000
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE				(1 << 22)
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE			(1 << 23)
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT			24
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_MASK			0x3f000000
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT				30
+#define   NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_MASK				0xc0000000
+#define  NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR						0x00000318
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_B_SHIFT					0
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_B_MASK					0x000000ff
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_G_SHIFT					8
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_G_MASK					0x0000ff00
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_R_SHIFT					16
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_R_MASK					0x00ff0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_A_SHIFT					24
+#define   NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_A_MASK					0xff000000
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(x)					(0x00000400+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SY(x)					(0x00000404+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SY__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SZ(x)					(0x00000408+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SZ__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_RHW(x)					(0x0000040c+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_RHW__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR(x)					(0x00000410+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR__SIZE				0x00000010
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_SHIFT				0
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_MASK				0x000000ff
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_SHIFT				8
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_MASK				0x0000ff00
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_SHIFT				16
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_MASK				0x00ff0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_SHIFT				24
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_MASK				0xff000000
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR(x)				(0x00000414+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR__SIZE				0x00000010
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_SHIFT				0
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_MASK				0x000000ff
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_SHIFT				8
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_MASK				0x0000ff00
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_SHIFT				16
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_MASK				0x00ff0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_SHIFT			24
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_MASK				0xff000000
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TU(x)					(0x00000418+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TU__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TV(x)					(0x0000041c+((x)*32))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TV__SIZE					0x00000010
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(x)				(0x00000600+((x)*4))
+#define  NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE__SIZE			0x00000040
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I0_SHIFT			0
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I0_MASK			0x0000000f
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I1_SHIFT			4
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I1_MASK			0x000000f0
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I2_SHIFT			8
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I2_MASK			0x00000f00
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I3_SHIFT			12
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I3_MASK			0x0000f000
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I4_SHIFT			16
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I4_MASK			0x000f0000
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I5_SHIFT			20
+#define   NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I5_MASK			0x00f00000
+
+
+#define NV04_DX6_MULTITEX_TRIANGLE							0x00000055
+
+#define  NV04_DX6_MULTITEX_TRIANGLE_NOP							0x00000100
+#define  NV04_DX6_MULTITEX_TRIANGLE_NOTIFY						0x00000104
+#define  NV04_DX6_MULTITEX_TRIANGLE_DMA_NOTIFY						0x00000180
+#define  NV04_DX6_MULTITEX_TRIANGLE_DMA_A						0x00000184
+#define  NV04_DX6_MULTITEX_TRIANGLE_DMA_B						0x00000188
+#define  NV04_DX6_MULTITEX_TRIANGLE_SURFACE						0x0000018c
+#define  NV04_DX6_MULTITEX_TRIANGLE_OFFSET(x)						(0x00000308+((x)*4))
+#define  NV04_DX6_MULTITEX_TRIANGLE_OFFSET__SIZE					0x00000002
+#define  NV04_DX6_MULTITEX_TRIANGLE_FORMAT(x)						(0x00000310+((x)*4))
+#define  NV04_DX6_MULTITEX_TRIANGLE_FORMAT__SIZE					0x00000002
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_DMA_SHIFT					0
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_DMA_MASK					0x0000000f
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT				4
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK				0x00000030
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT				6
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_MASK				0x000000c0
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_COLOR_SHIFT					8
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_COLOR_MASK					0x00000f00
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT				12
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK				0x0000f000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT				16
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_MASK				0x000f0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT				20
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_MASK				0x00f00000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_SHIFT				24
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_MASK				0x07000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_WRAPU					(1 << 27)
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_SHIFT				28
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_MASK				0x70000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FORMAT_WRAPV					(1 << 31)
+#define  NV04_DX6_MULTITEX_TRIANGLE_FILTER(x)						(0x00000318+((x)*4))
+#define  NV04_DX6_MULTITEX_TRIANGLE_FILTER__SIZE					0x00000002
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT				0
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK				0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT				8
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK				0x00007f00
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE			(1 << 15)
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT			16
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK				0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MINIFY_SHIFT				24
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MINIFY_MASK					0x07000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE			(1 << 27)
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MAGNIFY_SHIFT				28
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_MAGNIFY_MASK				0x70000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE			(1 << 31)
+#define  NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA					0x00000320
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE0				(1 <<  0)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA0				(1 <<  1)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT0_SHIFT			2
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT0_MASK			0x000000fc
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE1				(1 <<  8)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA1				(1 <<  9)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT1_SHIFT			10
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT1_MASK			0x0000fc00
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE2				(1 << 16)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA2				(1 << 17)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT2_SHIFT			18
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT2_MASK			0x00fc0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE3				(1 << 24)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA3				(1 << 25)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT3_SHIFT			26
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT3_MASK			0x1c000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_OPERATION_SHIFT			29
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_OPERATION_MASK			0xe0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR					0x00000324
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE0				(1 <<  0)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA0				(1 <<  1)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT0_SHIFT			2
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT0_MASK			0x000000fc
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE1				(1 <<  8)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA1				(1 <<  9)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT1_SHIFT			10
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT1_MASK			0x0000fc00
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE2				(1 << 16)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA2				(1 << 17)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT2_SHIFT			18
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT2_MASK			0x00fc0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE3				(1 << 24)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA3				(1 << 25)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT3_SHIFT			26
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT3_MASK			0x1c000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_OPERATION_SHIFT			29
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_OPERATION_MASK			0xe0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA					0x0000032c
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE0				(1 <<  0)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA0				(1 <<  1)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT0_SHIFT			2
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT0_MASK			0x000000fc
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE1				(1 <<  8)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA1				(1 <<  9)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT1_SHIFT			10
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT1_MASK			0x0000fc00
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE2				(1 << 16)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA2				(1 << 17)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT2_SHIFT			18
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT2_MASK			0x00fc0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE3				(1 << 24)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA3				(1 << 25)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT3_SHIFT			26
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT3_MASK			0x1c000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_OPERATION_SHIFT			29
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_OPERATION_MASK			0xe0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR					0x00000330
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE0				(1 <<  0)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA0				(1 <<  1)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT0_SHIFT			2
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT0_MASK			0x000000fc
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE1				(1 <<  8)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA1				(1 <<  9)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT1_SHIFT			10
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT1_MASK			0x0000fc00
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE2				(1 << 16)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA2				(1 << 17)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT2_SHIFT			18
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT2_MASK			0x00fc0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE3				(1 << 24)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA3				(1 << 25)
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT3_SHIFT			26
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT3_MASK			0x1c000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_OPERATION_SHIFT			29
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_OPERATION_MASK			0xe0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR					0x00000334
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_SHIFT				0
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_MASK				0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_SHIFT				8
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_MASK				0x0000ff00
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_SHIFT				16
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_MASK				0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_SHIFT				24
+#define   NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_MASK				0xff000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_BLEND						0x00000338
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_MASK_BIT_SHIFT				4
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_MASK_BIT_MASK				0x00000030
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_SHIFT				6
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_MASK				0x000000c0
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_SHIFT		8
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_MASK		0x00000f00
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SPECULAR_ENABLE_SHIFT			12
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SPECULAR_ENABLE_MASK				0x0000f000
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_FOG_ENABLE_SHIFT				16
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_FOG_ENABLE_MASK				0x000f0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_ALPHA_ENABLE_SHIFT				20
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_ALPHA_ENABLE_MASK				0x00f00000
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SRC_SHIFT					24
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_SRC_MASK					0x0f000000
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_DST_SHIFT					28
+#define   NV04_DX6_MULTITEX_TRIANGLE_BLEND_DST_MASK					0xf0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_CONTROL0						0x0000033c
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_SHIFT				0
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_MASK				0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_SHIFT				8
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_MASK				0x00000f00
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_TEST_ENABLE				(1 << 12)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ORIGIN					(1 << 13)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_ENABLE_SHIFT				14
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_ENABLE_MASK				0x0000c000
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_SHIFT				16
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_MASK				0x000f0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_SHIFT				20
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_MASK				0x00300000
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_DITHER_ENABLE				(1 << 22)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_PERSPECTIVE_ENABLE			(1 << 23)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_WRITE_ENABLE				(1 << 24)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_STENCIL_WRITE_ENABLE			(1 << 25)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_WRITE_ENABLE			(1 << 26)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_RED_WRITE_ENABLE				(1 << 27)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_GREEN_WRITE_ENABLE			(1 << 28)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_BLUE_WRITE_ENABLE				(1 << 29)
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_SHIFT				30
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_MASK				0xc0000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_CONTROL1						0x00000340
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_TEST_ENABLE_SHIFT			0
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_TEST_ENABLE_MASK			0x0000000f
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_SHIFT			4
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_MASK				0x000000f0
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_SHIFT				8
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_MASK				0x0000ff00
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_SHIFT			16
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_MASK			0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_SHIFT			24
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_MASK			0xff000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_CONTROL2						0x00000344
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_SHIFT			0
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_MASK			0x0000000f
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_SHIFT			4
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_MASK			0x000000f0
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_SHIFT			8
+#define   NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_MASK			0x00000f00
+#define  NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR						0x00000348
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_B_SHIFT					0
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_B_MASK					0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_G_SHIFT					8
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_G_MASK					0x0000ff00
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_R_SHIFT					16
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_R_MASK					0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_A_SHIFT					24
+#define   NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_A_MASK					0xff000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SX(x)					(0x00000400+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SX__SIZE					0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SY(x)					(0x00000404+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SY__SIZE					0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SZ(x)					(0x00000408+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SZ__SIZE					0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_RHW(x)					(0x0000040c+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_RHW__SIZE				0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR(x)					(0x00000410+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR__SIZE				0x00000008
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_SHIFT				0
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_MASK				0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_SHIFT				8
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_MASK				0x0000ff00
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_SHIFT				16
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_MASK				0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_SHIFT				24
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_MASK				0xff000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR(x)				(0x00000414+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR__SIZE				0x00000008
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_SHIFT			0
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_MASK				0x000000ff
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_SHIFT			8
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_MASK				0x0000ff00
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_SHIFT			16
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_MASK				0x00ff0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_SHIFT			24
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_MASK			0xff000000
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU0(x)					(0x00000418+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU0__SIZE				0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV0(x)					(0x0000041c+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV0__SIZE				0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU1(x)					(0x00000420+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU1__SIZE				0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV1(x)					(0x00000424+((x)*40))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV1__SIZE				0x00000008
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE(x)				(0x00000540+((x)*4))
+#define  NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE__SIZE			0x00000030
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I0_SHIFT			0
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I0_MASK			0x0000000f
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I1_SHIFT			4
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I1_MASK			0x000000f0
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I2_SHIFT			8
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I2_MASK			0x00000f00
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I3_SHIFT			12
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I3_MASK			0x0000f000
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I4_SHIFT			16
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I4_MASK			0x000f0000
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I5_SHIFT			20
+#define   NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I5_MASK			0x00f00000
+
+
+#define NV10_DX5_TEXTURED_TRIANGLE							0x00000094
+
+
+
+#define NV10TCL										0x00000056
+
+#define  NV10TCL_NOP									0x00000100
+#define  NV10TCL_NOTIFY									0x00000104
+#define  NV10TCL_DMA_NOTIFY								0x00000180
+#define  NV10TCL_DMA_IN_MEMORY0								0x00000184
+#define  NV10TCL_DMA_IN_MEMORY1								0x00000188
+#define  NV10TCL_DMA_VTXBUF0								0x0000018c
+#define  NV10TCL_DMA_IN_MEMORY2								0x00000194
+#define  NV10TCL_DMA_IN_MEMORY3								0x00000198
+#define  NV10TCL_RT_HORIZ								0x00000200
+#define   NV10TCL_RT_HORIZ_X_SHIFT							0
+#define   NV10TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV10TCL_RT_HORIZ_W_SHIFT							16
+#define   NV10TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV10TCL_RT_VERT								0x00000204
+#define   NV10TCL_RT_VERT_Y_SHIFT							0
+#define   NV10TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV10TCL_RT_VERT_H_SHIFT							16
+#define   NV10TCL_RT_VERT_H_MASK							0xffff0000
+#define  NV10TCL_RT_FORMAT								0x00000208
+#define   NV10TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV10TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV10TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV10TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV10TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV10TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV10TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV10TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV10TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV10TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV10TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV10TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV10TCL_RT_PITCH								0x0000020c
+#define   NV10TCL_RT_PITCH_COLOR_PITCH_SHIFT						0
+#define   NV10TCL_RT_PITCH_COLOR_PITCH_MASK						0x0000ffff
+#define   NV10TCL_RT_PITCH_ZETA_PITCH_SHIFT						16
+#define   NV10TCL_RT_PITCH_ZETA_PITCH_MASK						0xffff0000
+#define  NV10TCL_COLOR_OFFSET								0x00000210
+#define  NV10TCL_ZETA_OFFSET								0x00000214
+#define  NV10TCL_TX_OFFSET(x)								(0x00000218+((x)*4))
+#define  NV10TCL_TX_OFFSET__SIZE							0x00000002
+#define  NV10TCL_TX_FORMAT(x)								(0x00000220+((x)*4))
+#define  NV10TCL_TX_FORMAT__SIZE							0x00000002
+#define   NV10TCL_TX_FORMAT_DMA0							(1 <<  0)
+#define   NV10TCL_TX_FORMAT_DMA1							(1 <<  1)
+#define   NV10TCL_TX_FORMAT_CUBE_MAP							(1 <<  2)
+#define   NV10TCL_TX_FORMAT_FORMAT_SHIFT						7
+#define   NV10TCL_TX_FORMAT_FORMAT_MASK							0x00000780
+#define    NV10TCL_TX_FORMAT_FORMAT_L8							0x00000000
+#define    NV10TCL_TX_FORMAT_FORMAT_A8							0x00000080
+#define    NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5						0x00000100
+#define    NV10TCL_TX_FORMAT_FORMAT_A8_RECT						0x00000180
+#define    NV10TCL_TX_FORMAT_FORMAT_A4R4G4B4						0x00000200
+#define    NV10TCL_TX_FORMAT_FORMAT_R5G6B5						0x00000280
+#define    NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8						0x00000300
+#define    NV10TCL_TX_FORMAT_FORMAT_X8R8G8B8						0x00000380
+#define    NV10TCL_TX_FORMAT_FORMAT_INDEX8						0x00000580
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT1						0x00000600
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT3						0x00000700
+#define    NV10TCL_TX_FORMAT_FORMAT_DXT5						0x00000780
+#define    NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT					0x00000800
+#define    NV10TCL_TX_FORMAT_FORMAT_R5G6B5_RECT						0x00000880
+#define    NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT					0x00000900
+#define    NV10TCL_TX_FORMAT_FORMAT_L8_RECT						0x00000980
+#define    NV10TCL_TX_FORMAT_FORMAT_A8L8						0x00000d00
+#define    NV10TCL_TX_FORMAT_FORMAT_A8_RECT2						0x00000d80
+#define    NV10TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT					0x00000e80
+#define    NV10TCL_TX_FORMAT_FORMAT_R8G8B8_RECT						0x00000f00
+#define    NV10TCL_TX_FORMAT_FORMAT_L8A8_RECT						0x00001000
+#define    NV10TCL_TX_FORMAT_FORMAT_A16							0x00001900
+#define    NV10TCL_TX_FORMAT_FORMAT_HILO16						0x00001980
+#define    NV10TCL_TX_FORMAT_FORMAT_A16_RECT						0x00001a80
+#define    NV10TCL_TX_FORMAT_FORMAT_HILO16_RECT						0x00001b00
+#define    NV10TCL_TX_FORMAT_FORMAT_HILO8						0x00002200
+#define    NV10TCL_TX_FORMAT_FORMAT_SIGNED_HILO8					0x00002280
+#define    NV10TCL_TX_FORMAT_FORMAT_HILO8_RECT						0x00002300
+#define    NV10TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT					0x00002380
+#define    NV10TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV					0x00002500
+#define    NV10TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV					0x00002580
+#define    NV10TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV					0x00002600
+#define   NV10TCL_TX_FORMAT_NPOT							(1 << 11)
+#define   NV10TCL_TX_FORMAT_MIPMAP_LEVELS_SHIFT						12
+#define   NV10TCL_TX_FORMAT_MIPMAP_LEVELS_MASK						0x0000f000
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_U_SHIFT						16
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_U_MASK						0x000f0000
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_V_SHIFT						20
+#define   NV10TCL_TX_FORMAT_BASE_SIZE_V_MASK						0x00f00000
+#define   NV10TCL_TX_FORMAT_WRAP_S_SHIFT						24
+#define   NV10TCL_TX_FORMAT_WRAP_S_MASK							0x0f000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_REPEAT						0x01000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT					0x02000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE					0x03000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER					0x04000000
+#define    NV10TCL_TX_FORMAT_WRAP_S_CLAMP						0x05000000
+#define   NV10TCL_TX_FORMAT_WRAP_T_SHIFT						28
+#define   NV10TCL_TX_FORMAT_WRAP_T_MASK							0xf0000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_REPEAT						0x10000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_MIRRORED_REPEAT					0x20000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_EDGE					0x30000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_BORDER					0x40000000
+#define    NV10TCL_TX_FORMAT_WRAP_T_CLAMP						0x50000000
+#define  NV10TCL_TX_ENABLE(x)								(0x00000228+((x)*4))
+#define  NV10TCL_TX_ENABLE__SIZE							0x00000002
+#define   NV10TCL_TX_ENABLE_ANISOTROPY_SHIFT						4
+#define   NV10TCL_TX_ENABLE_ANISOTROPY_MASK						0x00000030
+#define   NV10TCL_TX_ENABLE_ENABLE							(1 << 30)
+#define  NV10TCL_TX_NPOT_PITCH(x)							(0x00000230+((x)*4))
+#define  NV10TCL_TX_NPOT_PITCH__SIZE							0x00000002
+#define   NV10TCL_TX_NPOT_PITCH_PITCH_SHIFT						16
+#define   NV10TCL_TX_NPOT_PITCH_PITCH_MASK						0xffff0000
+#define  NV10TCL_TX_NPOT_SIZE(x)							(0x00000240+((x)*4))
+#define  NV10TCL_TX_NPOT_SIZE__SIZE							0x00000002
+#define   NV10TCL_TX_NPOT_SIZE_H_SHIFT							0
+#define   NV10TCL_TX_NPOT_SIZE_H_MASK							0x0000ffff
+#define   NV10TCL_TX_NPOT_SIZE_W_SHIFT							16
+#define   NV10TCL_TX_NPOT_SIZE_W_MASK							0xffff0000
+#define  NV10TCL_TX_FILTER(x)								(0x00000248+((x)*4))
+#define  NV10TCL_TX_FILTER__SIZE							0x00000002
+#define   NV10TCL_TX_FILTER_MINIFY_SHIFT						24
+#define   NV10TCL_TX_FILTER_MINIFY_MASK							0x0f000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST						0x01000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR						0x02000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST				0x03000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST				0x04000000
+#define    NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR				0x05000000
+#define    NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR				0x06000000
+#define   NV10TCL_TX_FILTER_MAGNIFY_SHIFT						28
+#define   NV10TCL_TX_FILTER_MAGNIFY_MASK						0xf0000000
+#define    NV10TCL_TX_FILTER_MAGNIFY_NEAREST						0x10000000
+#define    NV10TCL_TX_FILTER_MAGNIFY_LINEAR						0x20000000
+#define  NV10TCL_TX_PALETTE_OFFSET(x)							(0x00000250+((x)*4))
+#define  NV10TCL_TX_PALETTE_OFFSET__SIZE						0x00000002
+#define  NV10TCL_RC_IN_ALPHA(x)								(0x00000260+((x)*4))
+#define  NV10TCL_RC_IN_ALPHA__SIZE							0x00000002
+#define   NV10TCL_RC_IN_ALPHA_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_IN_ALPHA_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0_NV				0x00000001
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1_NV				0x00000002
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR_NV					0x00000004
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR_NV				0x00000005
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0_ARB					0x00000008
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1_ARB					0x00000009
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0_NV					0x0000000c
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE1_NV					0x0000000d
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0000000e
+#define    NV10TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F_NV					0x0000000f
+#define   NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_IN_ALPHA_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_IN_ALPHA_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT_NV				0x00000020
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL_NV				0x00000040
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE_NV				0x00000060
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL_NV				0x00000080
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE_NV				0x000000a0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY_NV				0x000000c0
+#define    NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE_NV				0x000000e0
+#define   NV10TCL_RC_IN_ALPHA_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_IN_ALPHA_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0_NV				0x00000100
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1_NV				0x00000200
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR_NV					0x00000400
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR_NV				0x00000500
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0_NV					0x00000c00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE1_NV					0x00000d00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x00000e00
+#define    NV10TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_IN_ALPHA_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_IN_ALPHA_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT_NV				0x00002000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL_NV				0x00004000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE_NV				0x00006000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL_NV				0x00008000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE_NV				0x0000a000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY_NV				0x0000c000
+#define    NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE_NV				0x0000e000
+#define   NV10TCL_RC_IN_ALPHA_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_IN_ALPHA_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0_NV				0x00010000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1_NV				0x00020000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR_NV					0x00040000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR_NV				0x00050000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0_ARB					0x00080000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1_ARB					0x00090000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0_NV					0x000c0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE1_NV					0x000d0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x000e0000
+#define    NV10TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F_NV					0x000f0000
+#define   NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_IN_ALPHA_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_IN_ALPHA_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT_NV				0x00200000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL_NV				0x00400000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE_NV				0x00600000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL_NV				0x00800000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE_NV				0x00a00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY_NV				0x00c00000
+#define    NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE_NV				0x00e00000
+#define   NV10TCL_RC_IN_ALPHA_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_IN_ALPHA_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0_NV				0x01000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1_NV				0x02000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR_NV					0x04000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR_NV				0x05000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0_ARB					0x08000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1_ARB					0x09000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0_NV					0x0c000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE1_NV					0x0d000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0e000000
+#define    NV10TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F_NV					0x0f000000
+#define   NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE					0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_IN_ALPHA_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_IN_ALPHA_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT_NV				0x20000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL_NV				0x40000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE_NV				0x60000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL_NV				0x80000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE_NV				0xa0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY_NV				0xc0000000
+#define    NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE_NV				0xe0000000
+#define  NV10TCL_RC_IN_RGB(x)								(0x00000268+((x)*4))
+#define  NV10TCL_RC_IN_RGB__SIZE							0x00000002
+#define   NV10TCL_RC_IN_RGB_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_IN_RGB_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_IN_RGB_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0_NV					0x00000001
+#define    NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1_NV					0x00000002
+#define    NV10TCL_RC_IN_RGB_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR_NV					0x00000004
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR_NV					0x00000005
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE0_ARB					0x00000008
+#define    NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE1_ARB					0x00000009
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE0_NV						0x0000000c
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE1_NV						0x0000000d
+#define    NV10TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0000000e
+#define    NV10TCL_RC_IN_RGB_D_INPUT_E_TIMES_F_NV					0x0000000f
+#define   NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_IN_RGB_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_IN_RGB_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT_NV				0x00000020
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL_NV					0x00000040
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE_NV					0x00000060
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL_NV				0x00000080
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE_NV				0x000000a0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY_NV				0x000000c0
+#define    NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE_NV					0x000000e0
+#define   NV10TCL_RC_IN_RGB_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_IN_RGB_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0_NV					0x00000100
+#define    NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1_NV					0x00000200
+#define    NV10TCL_RC_IN_RGB_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR_NV					0x00000400
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR_NV					0x00000500
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE0_NV						0x00000c00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE1_NV						0x00000d00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x00000e00
+#define    NV10TCL_RC_IN_RGB_C_INPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_IN_RGB_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_IN_RGB_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT_NV				0x00002000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL_NV					0x00004000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE_NV					0x00006000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL_NV				0x00008000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE_NV				0x0000a000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY_NV				0x0000c000
+#define    NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE_NV					0x0000e000
+#define   NV10TCL_RC_IN_RGB_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_IN_RGB_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0_NV					0x00010000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1_NV					0x00020000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR_NV					0x00040000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR_NV					0x00050000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE0_ARB					0x00080000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE1_ARB					0x00090000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE0_NV						0x000c0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE1_NV						0x000d0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x000e0000
+#define    NV10TCL_RC_IN_RGB_B_INPUT_E_TIMES_F_NV					0x000f0000
+#define   NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_IN_RGB_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_IN_RGB_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT_NV				0x00200000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL_NV					0x00400000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE_NV					0x00600000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL_NV				0x00800000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE_NV				0x00a00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY_NV				0x00c00000
+#define    NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE_NV					0x00e00000
+#define   NV10TCL_RC_IN_RGB_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_IN_RGB_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0_NV					0x01000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1_NV					0x02000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR_NV					0x04000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR_NV					0x05000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE0_ARB					0x08000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE1_ARB					0x09000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE0_NV						0x0c000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE1_NV						0x0d000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0e000000
+#define    NV10TCL_RC_IN_RGB_A_INPUT_E_TIMES_F_NV					0x0f000000
+#define   NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_IN_RGB_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_IN_RGB_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT_NV				0x20000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL_NV					0x40000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE_NV					0x60000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL_NV				0x80000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE_NV				0xa0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY_NV				0xc0000000
+#define    NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE_NV					0xe0000000
+#define  NV10TCL_RC_COLOR(x)								(0x00000270+((x)*4))
+#define  NV10TCL_RC_COLOR__SIZE								0x00000002
+#define   NV10TCL_RC_COLOR_B_SHIFT							0
+#define   NV10TCL_RC_COLOR_B_MASK							0x000000ff
+#define   NV10TCL_RC_COLOR_G_SHIFT							8
+#define   NV10TCL_RC_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_RC_COLOR_R_SHIFT							16
+#define   NV10TCL_RC_COLOR_R_MASK							0x00ff0000
+#define   NV10TCL_RC_COLOR_A_SHIFT							24
+#define   NV10TCL_RC_COLOR_A_MASK							0xff000000
+#define  NV10TCL_RC_OUT_ALPHA(x)							(0x00000278+((x)*4))
+#define  NV10TCL_RC_OUT_ALPHA__SIZE							0x00000002
+#define   NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT						0
+#define   NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK						0x0000000f
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0_NV				0x00000001
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1_NV				0x00000002
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG						0x00000003
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR_NV				0x00000004
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR_NV				0x00000005
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0_ARB					0x00000008
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1_ARB					0x00000009
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_NV					0x0000000c
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1_NV					0x0000000d
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV		0x0000000e
+#define    NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F_NV					0x0000000f
+#define   NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT						4
+#define   NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK						0x000000f0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0_NV				0x00000010
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1_NV				0x00000020
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG						0x00000030
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR_NV				0x00000040
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR_NV				0x00000050
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0_ARB					0x00000080
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1_ARB					0x00000090
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_NV					0x000000c0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1_NV					0x000000d0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV		0x000000e0
+#define    NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F_NV					0x000000f0
+#define   NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT						8
+#define   NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK						0x00000f00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0_NV				0x00000100
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1_NV				0x00000200
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG						0x00000300
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR_NV				0x00000400
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR_NV				0x00000500
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_NV					0x00000c00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1_NV					0x00000d00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV		0x00000e00
+#define    NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT						(1 << 12)
+#define   NV10TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT						(1 << 13)
+#define   NV10TCL_RC_OUT_ALPHA_MUX_SUM							(1 << 14)
+#define   NV10TCL_RC_OUT_ALPHA_BIAS							(1 << 15)
+#define    NV10TCL_RC_OUT_ALPHA_BIAS_NONE						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV			0x00008000
+#define   NV10TCL_RC_OUT_ALPHA_SCALE_SHIFT						17
+#define   NV10TCL_RC_OUT_ALPHA_SCALE_MASK						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_NONE						0x00000000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO_NV					0x00020000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR_NV					0x00040000
+#define    NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF_NV				0x00060000
+#define  NV10TCL_RC_OUT_RGB(x)								(0x00000280+((x)*4))
+#define  NV10TCL_RC_OUT_RGB__SIZE							0x00000002
+#define   NV10TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT						0
+#define   NV10TCL_RC_OUT_RGB_CD_OUTPUT_MASK						0x0000000f
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0_NV				0x00000001
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1_NV				0x00000002
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_FOG						0x00000003
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR_NV				0x00000004
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR_NV				0x00000005
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0_ARB					0x00000008
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1_ARB					0x00000009
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_NV					0x0000000c
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1_NV					0x0000000d
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0000000e
+#define    NV10TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F_NV					0x0000000f
+#define   NV10TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT						4
+#define   NV10TCL_RC_OUT_RGB_AB_OUTPUT_MASK						0x000000f0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0_NV				0x00000010
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1_NV				0x00000020
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_FOG						0x00000030
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR_NV				0x00000040
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR_NV				0x00000050
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0_ARB					0x00000080
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1_ARB					0x00000090
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_NV					0x000000c0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1_NV					0x000000d0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x000000e0
+#define    NV10TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F_NV					0x000000f0
+#define   NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT						8
+#define   NV10TCL_RC_OUT_RGB_SUM_OUTPUT_MASK						0x00000f00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO						0x00000000
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0_NV				0x00000100
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1_NV				0x00000200
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_FOG						0x00000300
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR_NV				0x00000400
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR_NV				0x00000500
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_NV					0x00000c00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1_NV					0x00000d00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x00000e00
+#define    NV10TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_OUT_RGB_CD_DOT_PRODUCT						(1 << 12)
+#define   NV10TCL_RC_OUT_RGB_AB_DOT_PRODUCT						(1 << 13)
+#define   NV10TCL_RC_OUT_RGB_MUX_SUM							(1 << 14)
+#define   NV10TCL_RC_OUT_RGB_BIAS							(1 << 15)
+#define    NV10TCL_RC_OUT_RGB_BIAS_NONE							0x00000000
+#define    NV10TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV				0x00008000
+#define   NV10TCL_RC_OUT_RGB_SCALE_SHIFT						17
+#define   NV10TCL_RC_OUT_RGB_SCALE_MASK							0x00000000
+#define    NV10TCL_RC_OUT_RGB_SCALE_NONE						0x00000000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO_NV					0x00020000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR_NV					0x00040000
+#define    NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF_NV				0x00060000
+#define   NV10TCL_RC_OUT_RGB_OPERATION_SHIFT						27
+#define   NV10TCL_RC_OUT_RGB_OPERATION_MASK						0x38000000
+#define  NV10TCL_RC_FINAL0								0x00000288
+#define   NV10TCL_RC_FINAL0_D_INPUT_SHIFT						0
+#define   NV10TCL_RC_FINAL0_D_INPUT_MASK						0x0000000f
+#define    NV10TCL_RC_FINAL0_D_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0_NV					0x00000001
+#define    NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1_NV					0x00000002
+#define    NV10TCL_RC_FINAL0_D_INPUT_FOG						0x00000003
+#define    NV10TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR_NV					0x00000004
+#define    NV10TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR_NV					0x00000005
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE0_ARB					0x00000008
+#define    NV10TCL_RC_FINAL0_D_INPUT_TEXTURE1_ARB					0x00000009
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE0_NV						0x0000000c
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE1_NV						0x0000000d
+#define    NV10TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0000000e
+#define    NV10TCL_RC_FINAL0_D_INPUT_E_TIMES_F_NV					0x0000000f
+#define   NV10TCL_RC_FINAL0_D_COMPONENT_USAGE						(1 <<  4)
+#define    NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA					0x00000010
+#define   NV10TCL_RC_FINAL0_D_MAPPING_SHIFT						5
+#define   NV10TCL_RC_FINAL0_D_MAPPING_MASK						0x000000e0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT_NV				0x00000020
+#define    NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL_NV					0x00000040
+#define    NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE_NV					0x00000060
+#define    NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL_NV				0x00000080
+#define    NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE_NV				0x000000a0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY_NV				0x000000c0
+#define    NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE_NV					0x000000e0
+#define   NV10TCL_RC_FINAL0_C_INPUT_SHIFT						8
+#define   NV10TCL_RC_FINAL0_C_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_FINAL0_C_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0_NV					0x00000100
+#define    NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1_NV					0x00000200
+#define    NV10TCL_RC_FINAL0_C_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR_NV					0x00000400
+#define    NV10TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR_NV					0x00000500
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_FINAL0_C_INPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE0_NV						0x00000c00
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE1_NV						0x00000d00
+#define    NV10TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x00000e00
+#define    NV10TCL_RC_FINAL0_C_INPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_FINAL0_C_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_FINAL0_C_MAPPING_SHIFT						13
+#define   NV10TCL_RC_FINAL0_C_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT_NV				0x00002000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL_NV					0x00004000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE_NV					0x00006000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL_NV				0x00008000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE_NV				0x0000a000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY_NV				0x0000c000
+#define    NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE_NV					0x0000e000
+#define   NV10TCL_RC_FINAL0_B_INPUT_SHIFT						16
+#define   NV10TCL_RC_FINAL0_B_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0_NV					0x00010000
+#define    NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1_NV					0x00020000
+#define    NV10TCL_RC_FINAL0_B_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR_NV					0x00040000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR_NV					0x00050000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE0_ARB					0x00080000
+#define    NV10TCL_RC_FINAL0_B_INPUT_TEXTURE1_ARB					0x00090000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE0_NV						0x000c0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE1_NV						0x000d0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x000e0000
+#define    NV10TCL_RC_FINAL0_B_INPUT_E_TIMES_F_NV					0x000f0000
+#define   NV10TCL_RC_FINAL0_B_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_FINAL0_B_MAPPING_SHIFT						21
+#define   NV10TCL_RC_FINAL0_B_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT_NV				0x00200000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL_NV					0x00400000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE_NV					0x00600000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL_NV				0x00800000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE_NV				0x00a00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY_NV				0x00c00000
+#define    NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE_NV					0x00e00000
+#define   NV10TCL_RC_FINAL0_A_INPUT_SHIFT						24
+#define   NV10TCL_RC_FINAL0_A_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0_NV					0x01000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1_NV					0x02000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV					0x04000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR_NV					0x05000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE0_ARB					0x08000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_TEXTURE1_ARB					0x09000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE0_NV						0x0c000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE1_NV						0x0d000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0e000000
+#define    NV10TCL_RC_FINAL0_A_INPUT_E_TIMES_F_NV					0x0f000000
+#define   NV10TCL_RC_FINAL0_A_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_FINAL0_A_MAPPING_SHIFT						29
+#define   NV10TCL_RC_FINAL0_A_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV				0x20000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL_NV					0x40000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE_NV					0x60000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL_NV				0x80000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE_NV				0xa0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY_NV				0xc0000000
+#define    NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE_NV					0xe0000000
+#define  NV10TCL_RC_FINAL1								0x0000028c
+#define   NV10TCL_RC_FINAL1_COLOR_SUM_CLAMP						(1 <<  7)
+#define   NV10TCL_RC_FINAL1_G_INPUT_SHIFT						8
+#define   NV10TCL_RC_FINAL1_G_INPUT_MASK						0x00000f00
+#define    NV10TCL_RC_FINAL1_G_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0_NV					0x00000100
+#define    NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1_NV					0x00000200
+#define    NV10TCL_RC_FINAL1_G_INPUT_FOG						0x00000300
+#define    NV10TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR_NV					0x00000400
+#define    NV10TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR_NV					0x00000500
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE0_ARB					0x00000800
+#define    NV10TCL_RC_FINAL1_G_INPUT_TEXTURE1_ARB					0x00000900
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE0_NV						0x00000c00
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE1_NV						0x00000d00
+#define    NV10TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x00000e00
+#define    NV10TCL_RC_FINAL1_G_INPUT_E_TIMES_F_NV					0x00000f00
+#define   NV10TCL_RC_FINAL1_G_COMPONENT_USAGE						(1 << 12)
+#define    NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA					0x00001000
+#define   NV10TCL_RC_FINAL1_G_MAPPING_SHIFT						13
+#define   NV10TCL_RC_FINAL1_G_MAPPING_MASK						0x0000e000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT_NV				0x00002000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL_NV					0x00004000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE_NV					0x00006000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL_NV				0x00008000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE_NV				0x0000a000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY_NV				0x0000c000
+#define    NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE_NV					0x0000e000
+#define   NV10TCL_RC_FINAL1_F_INPUT_SHIFT						16
+#define   NV10TCL_RC_FINAL1_F_INPUT_MASK						0x000f0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0_NV					0x00010000
+#define    NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1_NV					0x00020000
+#define    NV10TCL_RC_FINAL1_F_INPUT_FOG						0x00030000
+#define    NV10TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR_NV					0x00040000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR_NV					0x00050000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE0_ARB					0x00080000
+#define    NV10TCL_RC_FINAL1_F_INPUT_TEXTURE1_ARB					0x00090000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE0_NV						0x000c0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE1_NV						0x000d0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x000e0000
+#define    NV10TCL_RC_FINAL1_F_INPUT_E_TIMES_F_NV					0x000f0000
+#define   NV10TCL_RC_FINAL1_F_COMPONENT_USAGE						(1 << 20)
+#define    NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA					0x00100000
+#define   NV10TCL_RC_FINAL1_F_MAPPING_SHIFT						21
+#define   NV10TCL_RC_FINAL1_F_MAPPING_MASK						0x00e00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT_NV				0x00200000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL_NV					0x00400000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE_NV					0x00600000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL_NV				0x00800000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE_NV				0x00a00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY_NV				0x00c00000
+#define    NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE_NV					0x00e00000
+#define   NV10TCL_RC_FINAL1_E_INPUT_SHIFT						24
+#define   NV10TCL_RC_FINAL1_E_INPUT_MASK						0x0f000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_ZERO						0x00000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0_NV					0x01000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1_NV					0x02000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_FOG						0x03000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR_NV					0x04000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR_NV					0x05000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE0_ARB					0x08000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_TEXTURE1_ARB					0x09000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE0_NV						0x0c000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE1_NV						0x0d000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV			0x0e000000
+#define    NV10TCL_RC_FINAL1_E_INPUT_E_TIMES_F_NV					0x0f000000
+#define   NV10TCL_RC_FINAL1_E_COMPONENT_USAGE						(1 << 28)
+#define    NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB					0x00000000
+#define    NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA					0x10000000
+#define   NV10TCL_RC_FINAL1_E_MAPPING_SHIFT						29
+#define   NV10TCL_RC_FINAL1_E_MAPPING_MASK						0xe0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY_NV				0x00000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT_NV				0x20000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL_NV					0x40000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE_NV					0x60000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL_NV				0x80000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE_NV				0xa0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY_NV				0xc0000000
+#define    NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE_NV					0xe0000000
+#define  NV10TCL_LIGHT_MODEL								0x00000294
+#define   NV10TCL_LIGHT_MODEL_COLOR_CONTROL						(1 <<  1)
+#define   NV10TCL_LIGHT_MODEL_LOCAL_VIEWER						(1 << 16)
+#define  NV10TCL_COLOR_MATERIAL_ENABLE							0x00000298
+#define   NV10TCL_COLOR_MATERIAL_ENABLE_SPECULAR					(1 <<  0)
+#define   NV10TCL_COLOR_MATERIAL_ENABLE_DIFFUSE						(1 <<  1)
+#define   NV10TCL_COLOR_MATERIAL_ENABLE_AMBIENT						(1 <<  2)
+#define   NV10TCL_COLOR_MATERIAL_ENABLE_EMISSION					(1 <<  3)
+#define  NV10TCL_FOG_MODE								0x0000029c
+#define   NV10TCL_FOG_MODE_EXP								0x00000800
+#define   NV10TCL_FOG_MODE_EXP_2							0x00000802
+#define   NV10TCL_FOG_MODE_EXP2								0x00000803
+#define   NV10TCL_FOG_MODE_LINEAR							0x00000804
+#define   NV10TCL_FOG_MODE_LINEAR_2							0x00002601
+#define  NV10TCL_FOG_COORD_DIST								0x000002a0
+#define  NV10TCL_FOG_ENABLE								0x000002a4
+#define  NV10TCL_FOG_COLOR								0x000002a8
+#define   NV10TCL_FOG_COLOR_R_SHIFT							0
+#define   NV10TCL_FOG_COLOR_R_MASK							0x000000ff
+#define   NV10TCL_FOG_COLOR_G_SHIFT							8
+#define   NV10TCL_FOG_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_FOG_COLOR_B_SHIFT							16
+#define   NV10TCL_FOG_COLOR_B_MASK							0x00ff0000
+#define   NV10TCL_FOG_COLOR_A_SHIFT							24
+#define   NV10TCL_FOG_COLOR_A_MASK							0xff000000
+#define  NV10TCL_VIEWPORT_CLIP_MODE							0x000002b4
+#define  NV10TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*4))
+#define  NV10TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_SHIFT					0
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_MASK					0x000007ff
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_LEFT_ENABLE					(1 << 11)
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_SHIFT					16
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_MASK					0x07ff0000
+#define   NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_RIGHT_ENABLE					(1 << 27)
+#define  NV10TCL_VIEWPORT_CLIP_VERT(x)							(0x000002e0+((x)*4))
+#define  NV10TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_SHIFT					0
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_MASK					0x000007ff
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_TOP_ENABLE					(1 << 11)
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_SHIFT					16
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_MASK					0x07ff0000
+#define   NV10TCL_VIEWPORT_CLIP_VERT_CLIP_BOTTOM_ENABLE					(1 << 27)
+#define  NV10TCL_ALPHA_FUNC_ENABLE							0x00000300
+#define  NV10TCL_BLEND_FUNC_ENABLE							0x00000304
+#define  NV10TCL_CULL_FACE_ENABLE							0x00000308
+#define  NV10TCL_DEPTH_TEST_ENABLE							0x0000030c
+#define  NV10TCL_DITHER_ENABLE								0x00000310
+#define  NV10TCL_LIGHTING_ENABLE							0x00000314
+#define  NV10TCL_POINT_PARAMETERS_ENABLE						0x00000318
+#define  NV10TCL_POINT_SMOOTH_ENABLE							0x0000031c
+#define  NV10TCL_LINE_SMOOTH_ENABLE							0x00000320
+#define  NV10TCL_POLYGON_SMOOTH_ENABLE							0x00000324
+#define  NV10TCL_VERTEX_WEIGHT_ENABLE							0x00000328
+#define  NV10TCL_STENCIL_ENABLE								0x0000032c
+#define  NV10TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000330
+#define  NV10TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000334
+#define  NV10TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000338
+#define  NV10TCL_ALPHA_FUNC_FUNC							0x0000033c
+#define   NV10TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV10TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV10TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV10TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV10TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV10TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV10TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV10TCL_ALPHA_FUNC_REF								0x00000340
+#define  NV10TCL_BLEND_FUNC_SRC								0x00000344
+#define   NV10TCL_BLEND_FUNC_SRC_ZERO							0x00000000
+#define   NV10TCL_BLEND_FUNC_SRC_ONE							0x00000001
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_COLOR						0x00000300
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA						0x00000302
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV10TCL_BLEND_FUNC_SRC_DST_ALPHA						0x00000304
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV10TCL_BLEND_FUNC_SRC_DST_COLOR						0x00000306
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE					0x00000308
+#define   NV10TCL_BLEND_FUNC_SRC_CONSTANT_COLOR						0x00008001
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV10TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA						0x00008003
+#define   NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV10TCL_BLEND_FUNC_DST								0x00000348
+#define   NV10TCL_BLEND_FUNC_DST_ZERO							0x00000000
+#define   NV10TCL_BLEND_FUNC_DST_ONE							0x00000001
+#define   NV10TCL_BLEND_FUNC_DST_SRC_COLOR						0x00000300
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV10TCL_BLEND_FUNC_DST_SRC_ALPHA						0x00000302
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV10TCL_BLEND_FUNC_DST_DST_ALPHA						0x00000304
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV10TCL_BLEND_FUNC_DST_DST_COLOR						0x00000306
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV10TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE					0x00000308
+#define   NV10TCL_BLEND_FUNC_DST_CONSTANT_COLOR						0x00008001
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV10TCL_BLEND_FUNC_DST_CONSTANT_ALPHA						0x00008003
+#define   NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV10TCL_BLEND_COLOR								0x0000034c
+#define   NV10TCL_BLEND_COLOR_B_SHIFT							0
+#define   NV10TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV10TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV10TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV10TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV10TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV10TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV10TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV10TCL_BLEND_EQUATION								0x00000350
+#define   NV10TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV10TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV10TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV10TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a
+#define   NV10TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV10TCL_DEPTH_FUNC								0x00000354
+#define   NV10TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV10TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV10TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV10TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV10TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV10TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV10TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV10TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV10TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV10TCL_COLOR_MASK								0x00000358
+#define   NV10TCL_COLOR_MASK_B								(1 <<  0)
+#define   NV10TCL_COLOR_MASK_G								(1 <<  8)
+#define   NV10TCL_COLOR_MASK_R								(1 << 16)
+#define   NV10TCL_COLOR_MASK_A								(1 << 24)
+#define  NV10TCL_DEPTH_WRITE_ENABLE							0x0000035c
+#define  NV10TCL_STENCIL_MASK								0x00000360
+#define  NV10TCL_STENCIL_FUNC_FUNC							0x00000364
+#define   NV10TCL_STENCIL_FUNC_FUNC_NEVER						0x00000200
+#define   NV10TCL_STENCIL_FUNC_FUNC_LESS						0x00000201
+#define   NV10TCL_STENCIL_FUNC_FUNC_EQUAL						0x00000202
+#define   NV10TCL_STENCIL_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV10TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV10TCL_STENCIL_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV10TCL_STENCIL_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV10TCL_STENCIL_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV10TCL_STENCIL_FUNC_REF							0x00000368
+#define  NV10TCL_STENCIL_FUNC_MASK							0x0000036c
+#define  NV10TCL_STENCIL_OP_FAIL							0x00000370
+#define   NV10TCL_STENCIL_OP_FAIL_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_FAIL_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_FAIL_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_FAIL_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_FAIL_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_FAIL_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_FAIL_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_FAIL_DECR_WRAP						0x00008508
+#define  NV10TCL_STENCIL_OP_ZFAIL							0x00000374
+#define   NV10TCL_STENCIL_OP_ZFAIL_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_ZFAIL_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_ZFAIL_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_ZFAIL_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_ZFAIL_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_ZFAIL_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_ZFAIL_DECR_WRAP						0x00008508
+#define  NV10TCL_STENCIL_OP_ZPASS							0x00000378
+#define   NV10TCL_STENCIL_OP_ZPASS_ZERO							0x00000000
+#define   NV10TCL_STENCIL_OP_ZPASS_INVERT						0x0000150a
+#define   NV10TCL_STENCIL_OP_ZPASS_KEEP							0x00001e00
+#define   NV10TCL_STENCIL_OP_ZPASS_REPLACE						0x00001e01
+#define   NV10TCL_STENCIL_OP_ZPASS_INCR							0x00001e02
+#define   NV10TCL_STENCIL_OP_ZPASS_DECR							0x00001e03
+#define   NV10TCL_STENCIL_OP_ZPASS_INCR_WRAP						0x00008507
+#define   NV10TCL_STENCIL_OP_ZPASS_DECR_WRAP						0x00008508
+#define  NV10TCL_SHADE_MODEL								0x0000037c
+#define   NV10TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV10TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV10TCL_LINE_WIDTH								0x00000380
+#define  NV10TCL_POLYGON_OFFSET_FACTOR							0x00000384
+#define  NV10TCL_POLYGON_OFFSET_UNITS							0x00000388
+#define  NV10TCL_POLYGON_MODE_FRONT							0x0000038c
+#define   NV10TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV10TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV10TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV10TCL_POLYGON_MODE_BACK							0x00000390
+#define   NV10TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV10TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV10TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV10TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV10TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV10TCL_CULL_FACE								0x0000039c
+#define   NV10TCL_CULL_FACE_FRONT							0x00000404
+#define   NV10TCL_CULL_FACE_BACK							0x00000405
+#define   NV10TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV10TCL_FRONT_FACE								0x000003a0
+#define   NV10TCL_FRONT_FACE_CW								0x00000900
+#define   NV10TCL_FRONT_FACE_CCW							0x00000901
+#define  NV10TCL_NORMALIZE_ENABLE							0x000003a4
+#define  NV10TCL_COLOR_MATERIAL_R							0x000003a8
+#define  NV10TCL_COLOR_MATERIAL_G							0x000003ac
+#define  NV10TCL_COLOR_MATERIAL_B							0x000003b0
+#define  NV10TCL_COLOR_MATERIAL_A							0x000003b4
+#define  NV10TCL_COLOR_CONTROL								0x000003b8
+#define  NV10TCL_ENABLED_LIGHTS								0x000003bc
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT0							(1 <<  0)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT1							(1 <<  2)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT2							(1 <<  4)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT3							(1 <<  6)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT4							(1 <<  8)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT5							(1 << 10)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT6							(1 << 12)
+#define   NV10TCL_ENABLED_LIGHTS_LIGHT7							(1 << 14)
+#define  NV10TCL_CLIP_PLANE_ENABLE(x)							(0x000003c0+((x)*4))
+#define  NV10TCL_CLIP_PLANE_ENABLE__SIZE						0x00000008
+#define   NV10TCL_CLIP_PLANE_ENABLE_FALSE						0x00000000
+#define   NV10TCL_CLIP_PLANE_ENABLE_EYE_LINEAR						0x00002400
+#define   NV10TCL_CLIP_PLANE_ENABLE_OBJECT_LINEAR					0x00002401
+#define  NV10TCL_TX_MATRIX_ENABLE(x)							(0x000003e0+((x)*4))
+#define  NV10TCL_TX_MATRIX_ENABLE__SIZE							0x00000002
+#define  NV10TCL_VIEW_MATRIX_ENABLE							0x000003e8
+#define   NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW1						(1 <<  0)
+#define   NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW0						(1 <<  1)
+#define   NV10TCL_VIEW_MATRIX_ENABLE_PROJECTION						(1 <<  2)
+#define  NV10TCL_POINT_SIZE								0x000003ec
+#define  NV10TCL_MODELVIEW0_MATRIX(x)							(0x00000400+((x)*4))
+#define  NV10TCL_MODELVIEW0_MATRIX__SIZE						0x00000010
+#define  NV10TCL_MODELVIEW1_MATRIX(x)							(0x00000440+((x)*4))
+#define  NV10TCL_MODELVIEW1_MATRIX__SIZE						0x00000010
+#define  NV10TCL_INVERSE_MODELVIEW0_MATRIX(x)						(0x00000480+((x)*4))
+#define  NV10TCL_INVERSE_MODELVIEW0_MATRIX__SIZE					0x00000010
+#define  NV10TCL_INVERSE_MODELVIEW1_MATRIX(x)						(0x000004c0+((x)*4))
+#define  NV10TCL_INVERSE_MODELVIEW1_MATRIX__SIZE					0x00000010
+#define  NV10TCL_PROJECTION_MATRIX(x)							(0x00000500+((x)*4))
+#define  NV10TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV10TCL_TX0_MATRIX(x)								(0x00000540+((x)*4))
+#define  NV10TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV10TCL_TX1_MATRIX(x)								(0x00000580+((x)*4))
+#define  NV10TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV10TCL_CLIP_PLANE_A(x)							(0x00000600+((x)*16))
+#define  NV10TCL_CLIP_PLANE_A__SIZE							0x00000008
+#define  NV10TCL_CLIP_PLANE_B(x)							(0x00000604+((x)*16))
+#define  NV10TCL_CLIP_PLANE_B__SIZE							0x00000008
+#define  NV10TCL_CLIP_PLANE_C(x)							(0x00000608+((x)*16))
+#define  NV10TCL_CLIP_PLANE_C__SIZE							0x00000008
+#define  NV10TCL_CLIP_PLANE_D(x)							(0x0000060c+((x)*16))
+#define  NV10TCL_CLIP_PLANE_D__SIZE							0x00000008
+#define  NV10TCL_FOG_EQUATION_CONSTANT							0x00000680
+#define  NV10TCL_FOG_EQUATION_LINEAR							0x00000684
+#define  NV10TCL_FOG_EQUATION_QUADRATIC							0x00000688
+#define  NV10TCL_FRONT_MATERIAL_SHININESS(x)						(0x000006a0+((x)*4))
+#define  NV10TCL_FRONT_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x000006c4
+#define  NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x000006c8
+#define  NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x000006cc
+#define  NV10TCL_VIEWPORT_SCALE_X							0x000006e8
+#define  NV10TCL_VIEWPORT_SCALE_Y							0x000006ec
+#define  NV10TCL_VIEWPORT_SCALE_Z							0x000006f0
+#define  NV10TCL_VIEWPORT_SCALE_W							0x000006f4
+#define  NV10TCL_POINT_PARAMETER(x)							(0x000006f8+((x)*4))
+#define  NV10TCL_POINT_PARAMETER__SIZE							0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x)					(0x00000800+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x)					(0x00000804+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x)					(0x00000808+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x)					(0x0000080c+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x)					(0x00000810+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x)					(0x00000814+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x)					(0x00000818+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x)					(0x0000081c+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE				0x00000008
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x)					(0x00000820+((x)*128))
+#define  NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE				0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_X(x)							(0x00000828+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_X__SIZE						0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_Y(x)							(0x0000082c+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_Y__SIZE						0x00000008
+#define  NV10TCL_LIGHT_HALF_VECTOR_Z(x)							(0x00000830+((x)*128))
+#define  NV10TCL_LIGHT_HALF_VECTOR_Z__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_X(x)							(0x00000834+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_X__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_Y(x)							(0x00000838+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_Y__SIZE						0x00000008
+#define  NV10TCL_LIGHT_DIRECTION_Z(x)							(0x0000083c+((x)*128))
+#define  NV10TCL_LIGHT_DIRECTION_Z__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_A(x)							(0x00000840+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_A__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_B(x)							(0x00000844+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_B__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_C(x)							(0x00000848+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_C__SIZE						0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_X(x)							(0x0000084c+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_X__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_Y(x)							(0x00000850+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_Y__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_DIR_Z(x)							(0x00000854+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_DIR_Z__SIZE							0x00000008
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_D(x)							(0x00000858+((x)*128))
+#define  NV10TCL_LIGHT_SPOT_CUTOFF_D__SIZE						0x00000008
+#define  NV10TCL_LIGHT_POSITION_X(x)							(0x0000085c+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_X__SIZE							0x00000008
+#define  NV10TCL_LIGHT_POSITION_Y(x)							(0x00000860+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_Y__SIZE							0x00000008
+#define  NV10TCL_LIGHT_POSITION_Z(x)							(0x00000864+((x)*128))
+#define  NV10TCL_LIGHT_POSITION_Z__SIZE							0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_CONSTANT(x)						(0x00000868+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_CONSTANT__SIZE					0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_LINEAR(x)						(0x0000086c+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_LINEAR__SIZE						0x00000008
+#define  NV10TCL_LIGHT_ATTENUATION_QUADRATIC(x)						(0x00000870+((x)*128))
+#define  NV10TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE					0x00000008
+#define  NV10TCL_VERTEX_POS_3F_X							0x00000c00
+#define  NV10TCL_VERTEX_POS_3F_Y							0x00000c04
+#define  NV10TCL_VERTEX_POS_3F_Z							0x00000c08
+#define  NV10TCL_VERTEX_POS_4F_X							0x00000c18
+#define  NV10TCL_VERTEX_POS_4F_Y							0x00000c1c
+#define  NV10TCL_VERTEX_POS_4F_Z							0x00000c20
+#define  NV10TCL_VERTEX_POS_4F_W							0x00000c24
+#define  NV10TCL_VERTEX_NOR_3F_X							0x00000c30
+#define  NV10TCL_VERTEX_NOR_3F_Y							0x00000c34
+#define  NV10TCL_VERTEX_NOR_3F_Z							0x00000c38
+#define  NV10TCL_VERTEX_NOR_3I_XY							0x00000c40
+#define   NV10TCL_VERTEX_NOR_3I_XY_X_SHIFT						0
+#define   NV10TCL_VERTEX_NOR_3I_XY_X_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_NOR_3I_XY_Y_SHIFT						16
+#define   NV10TCL_VERTEX_NOR_3I_XY_Y_MASK						0xffff0000
+#define  NV10TCL_VERTEX_NOR_3I_Z							0x00000c44
+#define   NV10TCL_VERTEX_NOR_3I_Z_Z_SHIFT						0
+#define   NV10TCL_VERTEX_NOR_3I_Z_Z_MASK						0x0000ffff
+#define  NV10TCL_VERTEX_COL_4F_R							0x00000c50
+#define  NV10TCL_VERTEX_COL_4F_G							0x00000c54
+#define  NV10TCL_VERTEX_COL_4F_B							0x00000c58
+#define  NV10TCL_VERTEX_COL_4F_A							0x00000c5c
+#define  NV10TCL_VERTEX_COL_3F_R							0x00000c60
+#define  NV10TCL_VERTEX_COL_3F_G							0x00000c64
+#define  NV10TCL_VERTEX_COL_3F_B							0x00000c68
+#define  NV10TCL_VERTEX_COL_4I								0x00000c6c
+#define   NV10TCL_VERTEX_COL_4I_R_SHIFT							0
+#define   NV10TCL_VERTEX_COL_4I_R_MASK							0x000000ff
+#define   NV10TCL_VERTEX_COL_4I_G_SHIFT							8
+#define   NV10TCL_VERTEX_COL_4I_G_MASK							0x0000ff00
+#define   NV10TCL_VERTEX_COL_4I_B_SHIFT							16
+#define   NV10TCL_VERTEX_COL_4I_B_MASK							0x00ff0000
+#define   NV10TCL_VERTEX_COL_4I_A_SHIFT							24
+#define   NV10TCL_VERTEX_COL_4I_A_MASK							0xff000000
+#define  NV10TCL_VERTEX_COL2_3F_R							0x00000c80
+#define  NV10TCL_VERTEX_COL2_3F_G							0x00000c84
+#define  NV10TCL_VERTEX_COL2_3F_B							0x00000c88
+#define  NV10TCL_VERTEX_COL2_3I								0x00000c8c
+#define   NV10TCL_VERTEX_COL2_3I_R_SHIFT						0
+#define   NV10TCL_VERTEX_COL2_3I_R_MASK							0x000000ff
+#define   NV10TCL_VERTEX_COL2_3I_G_SHIFT						8
+#define   NV10TCL_VERTEX_COL2_3I_G_MASK							0x0000ff00
+#define   NV10TCL_VERTEX_COL2_3I_B_SHIFT						16
+#define   NV10TCL_VERTEX_COL2_3I_B_MASK							0x00ff0000
+#define  NV10TCL_VERTEX_TX0_2F_S							0x00000c90
+#define  NV10TCL_VERTEX_TX0_2F_T							0x00000c94
+#define  NV10TCL_VERTEX_TX0_2I								0x00000c98
+#define   NV10TCL_VERTEX_TX0_2I_S_SHIFT							0
+#define   NV10TCL_VERTEX_TX0_2I_S_MASK							0x0000ffff
+#define   NV10TCL_VERTEX_TX0_2I_T_SHIFT							16
+#define   NV10TCL_VERTEX_TX0_2I_T_MASK							0xffff0000
+#define  NV10TCL_VERTEX_TX0_4F_S							0x00000ca0
+#define  NV10TCL_VERTEX_TX0_4F_T							0x00000ca4
+#define  NV10TCL_VERTEX_TX0_4F_R							0x00000ca8
+#define  NV10TCL_VERTEX_TX0_4F_Q							0x00000cac
+#define  NV10TCL_VERTEX_TX0_4I_ST							0x00000cb0
+#define   NV10TCL_VERTEX_TX0_4I_ST_S_SHIFT						0
+#define   NV10TCL_VERTEX_TX0_4I_ST_S_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX0_4I_ST_T_SHIFT						16
+#define   NV10TCL_VERTEX_TX0_4I_ST_T_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX0_4I_RQ							0x00000cb4
+#define   NV10TCL_VERTEX_TX0_4I_RQ_R_SHIFT						0
+#define   NV10TCL_VERTEX_TX0_4I_RQ_R_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX0_4I_RQ_Q_SHIFT						16
+#define   NV10TCL_VERTEX_TX0_4I_RQ_Q_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX1_2F_S							0x00000cb8
+#define  NV10TCL_VERTEX_TX1_2F_T							0x00000cbc
+#define  NV10TCL_VERTEX_TX1_2I								0x00000cc0
+#define   NV10TCL_VERTEX_TX1_2I_S_SHIFT							0
+#define   NV10TCL_VERTEX_TX1_2I_S_MASK							0x0000ffff
+#define   NV10TCL_VERTEX_TX1_2I_T_SHIFT							16
+#define   NV10TCL_VERTEX_TX1_2I_T_MASK							0xffff0000
+#define  NV10TCL_VERTEX_TX1_4F_S							0x00000cc8
+#define  NV10TCL_VERTEX_TX1_4F_T							0x00000ccc
+#define  NV10TCL_VERTEX_TX1_4F_R							0x00000cd0
+#define  NV10TCL_VERTEX_TX1_4F_Q							0x00000cd4
+#define  NV10TCL_VERTEX_TX1_4I_ST							0x00000cd8
+#define   NV10TCL_VERTEX_TX1_4I_ST_S_SHIFT						0
+#define   NV10TCL_VERTEX_TX1_4I_ST_S_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX1_4I_ST_T_SHIFT						16
+#define   NV10TCL_VERTEX_TX1_4I_ST_T_MASK						0xffff0000
+#define  NV10TCL_VERTEX_TX1_4I_RQ							0x00000cdc
+#define   NV10TCL_VERTEX_TX1_4I_RQ_R_SHIFT						0
+#define   NV10TCL_VERTEX_TX1_4I_RQ_R_MASK						0x0000ffff
+#define   NV10TCL_VERTEX_TX1_4I_RQ_Q_SHIFT						16
+#define   NV10TCL_VERTEX_TX1_4I_RQ_Q_MASK						0xffff0000
+#define  NV10TCL_VERTEX_FOG_1F								0x00000ce0
+#define  NV10TCL_VERTEX_WGH_1F								0x00000ce4
+#define  NV10TCL_EDGEFLAG_ENABLE							0x00000cec
+#define  NV10TCL_VERTEX_ARRAY_VALIDATE							0x00000cf0
+#define  NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(x)						(0x00000d00+((x)*8))
+#define  NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET__SIZE					0x00000008
+#define  NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(x)						(0x00000d04+((x)*8))
+#define  NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT__SIZE					0x00000008
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_FIELDS_SHIFT				4
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_FIELDS_MASK				0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_STRIDE_SHIFT				8
+#define   NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_STRIDE_MASK				0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_POS						0x00000d00
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_POS						0x00000d04
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_POS_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_COL						0x00000d08
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_COL						0x00000d0c
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_COL2						0x00000d10
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_COL2						0x00000d14
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_COL2_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_TX0						0x00000d18
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_TX0						0x00000d1c
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX0_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_TX1						0x00000d20
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_TX1						0x00000d24
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_TX1_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_NOR						0x00000d28
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_NOR						0x00000d2c
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_NOR_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_WGH						0x00000d30
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_WGH						0x00000d34
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_WGH_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_ARRAY_OFFSET_FOG						0x00000d38
+#define  NV10TCL_VERTEX_ARRAY_FORMAT_FOG						0x00000d3c
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_TYPE_SHIFT					0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_TYPE_MASK					0x0000000f
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_FIELDS_SHIFT					4
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_FIELDS_MASK					0x000000f0
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_STRIDE_SHIFT					8
+#define   NV10TCL_VERTEX_ARRAY_FORMAT_FOG_STRIDE_MASK					0x0000ff00
+#define  NV10TCL_VERTEX_BEGIN_END							0x00000dfc
+#define   NV10TCL_VERTEX_BEGIN_END_STOP							0x00000000
+#define   NV10TCL_VERTEX_BEGIN_END_POINTS						0x00000001
+#define   NV10TCL_VERTEX_BEGIN_END_LINES						0x00000002
+#define   NV10TCL_VERTEX_BEGIN_END_LINE_LOOP						0x00000003
+#define   NV10TCL_VERTEX_BEGIN_END_LINE_STRIP						0x00000004
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLES						0x00000005
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP					0x00000006
+#define   NV10TCL_VERTEX_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV10TCL_VERTEX_BEGIN_END_QUADS						0x00000008
+#define   NV10TCL_VERTEX_BEGIN_END_QUAD_STRIP						0x00000009
+#define   NV10TCL_VERTEX_BEGIN_END_POLYGON						0x0000000a
+#define  NV10TCL_VB_ELEMENT_U16								0x00000e00
+#define   NV10TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NV10TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV10TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NV10TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV10TCL_VB_ELEMENT_U32								0x00001100
+#define  NV10TCL_VERTEX_BUFFER_BEGIN_END						0x000013fc
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_STOP						0x00000000
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_POINTS					0x00000001
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINES						0x00000002
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_LOOP					0x00000003
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_STRIP					0x00000004
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLES					0x00000005
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_STRIP				0x00000006
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_FAN					0x00000007
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_QUADS						0x00000008
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_QUAD_STRIP					0x00000009
+#define   NV10TCL_VERTEX_BUFFER_BEGIN_END_POLYGON					0x0000000a
+#define  NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS						0x00001400
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_SHIFT					0
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_MASK					0x0000ffff
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_SHIFT					24
+#define   NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_MASK					0xff000000
+#define  NV10TCL_VERTEX_ARRAY_DATA							0x00001800
+
+
+#define NV04_CONTEXT_COLOR_KEY								0x00000057
+
+
+
+#define NV03_CONTEXT_SURFACES_2D							0x00000058
+
+#define  NV03_CONTEXT_SURFACES_2D_SYNCHRONIZE						0x00000100
+#define  NV03_CONTEXT_SURFACES_2D_DMA_NOTIFY						0x00000180
+#define  NV03_CONTEXT_SURFACES_2D_DMA_SOURCE						0x00000184
+#define  NV03_CONTEXT_SURFACES_2D_DMA_DESTIN						0x00000188
+#define  NV03_CONTEXT_SURFACES_2D_COLOR_FORMAT						0x00000300
+#define  NV03_CONTEXT_SURFACES_2D_PITCH							0x00000304
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT					0
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK					0x0000ffff
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT					16
+#define   NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK					0xffff0000
+#define  NV03_CONTEXT_SURFACES_2D_OFFSET_SOURCE						0x00000308
+#define  NV03_CONTEXT_SURFACES_2D_OFFSET_DESTIN						0x0000030c
+
+
+#define NV03_CONTEXT_SURFACES_3D							0x0000005a
+
+#define  NV03_CONTEXT_SURFACES_3D_SYNCHRONIZE						0x00000100
+#define  NV03_CONTEXT_SURFACES_3D_DMA_NOTIFY						0x00000180
+#define  NV03_CONTEXT_SURFACES_3D_DMA_SURFACE						0x00000184
+#define  NV03_CONTEXT_SURFACES_3D_PITCH							0x00000300
+#define  NV03_CONTEXT_SURFACES_3D_OFFSET_COLOR						0x00000304
+#define  NV03_CONTEXT_SURFACES_3D_OFFSET_ZETA						0x00000308
+
+
+#define NV04_RENDER_SOLID_LINE								0x0000005c
+
+#define  NV04_RENDER_SOLID_LINE_SURFACE							0x00000198
+
+
+#define NV04_RENDER_SOLID_TRIANGLE							0x0000005d
+
+
+
+#define NV04_RENDER_SOLID_RECTANGLE							0x0000005e
+
+#define  NV04_RENDER_SOLID_RECTANGLE_SURFACE						0x00000198
+
+
+#define NV04_IMAGE_BLIT									0x0000005f
+
+#define  NV04_IMAGE_BLIT_NOP								0x00000100
+#define  NV04_IMAGE_BLIT_NOTIFY								0x00000104
+#define  NV04_IMAGE_BLIT_DMA_NOTIFY							0x00000180
+#define  NV04_IMAGE_BLIT_COLOR_KEY							0x00000184
+#define  NV04_IMAGE_BLIT_CLIP_RECTANGLE							0x00000188
+#define  NV04_IMAGE_BLIT_PATTERN							0x0000018c
+#define  NV04_IMAGE_BLIT_ROP								0x00000190
+#define  NV04_IMAGE_BLIT_BETA4								0x00000198
+#define  NV04_IMAGE_BLIT_SURFACE							0x0000019c
+#define  NV04_IMAGE_BLIT_OPERATION							0x000002fc
+#define   NV04_IMAGE_BLIT_OPERATION_SRCCOPY_AND						0x00000000
+#define   NV04_IMAGE_BLIT_OPERATION_ROP_AND						0x00000001
+#define   NV04_IMAGE_BLIT_OPERATION_BLEND_AND						0x00000002
+#define   NV04_IMAGE_BLIT_OPERATION_SRCCOPY						0x00000003
+#define   NV04_IMAGE_BLIT_OPERATION_SRCCOPY_PREMULT					0x00000004
+#define   NV04_IMAGE_BLIT_OPERATION_BLEND_PREMULT					0x00000005
+
+
+#define NV04_INDEXED_IMAGE_FROM_CPU							0x00000060
+
+#define  NV04_INDEXED_IMAGE_FROM_CPU_NOP						0x00000100
+#define  NV04_INDEXED_IMAGE_FROM_CPU_NOTIFY						0x00000104
+#define  NV04_INDEXED_IMAGE_FROM_CPU_PATCH						0x0000010c
+#define  NV04_INDEXED_IMAGE_FROM_CPU_DMA_NOTIFY						0x00000180
+#define  NV04_INDEXED_IMAGE_FROM_CPU_DMA_LUT						0x00000184
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR_FORMAT					0x000003e8
+#define  NV04_INDEXED_IMAGE_FROM_CPU_INDEX_FORMAT					0x000003ec
+#define  NV04_INDEXED_IMAGE_FROM_CPU_LUT_OFFSET						0x000003f0
+#define  NV04_INDEXED_IMAGE_FROM_CPU_POINT						0x000003f4
+#define  NV04_INDEXED_IMAGE_FROM_CPU_SIZE_OUT						0x000003f8
+#define  NV04_INDEXED_IMAGE_FROM_CPU_SIZE_IN						0x000003fc
+#define  NV04_INDEXED_IMAGE_FROM_CPU_COLOR						0x00000400
+
+
+#define NV04_IMAGE_FROM_CPU								0x00000061
+
+#define  NV04_IMAGE_FROM_CPU_BETA4							0x00000198
+#define  NV04_IMAGE_FROM_CPU_SURFACE							0x0000019c
+
+
+#define NV10_CONTEXT_SURFACES_2D							0x00000062
+
+
+
+#define NV05_SCALED_IMAGE_FROM_MEMORY							0x00000063
+
+#define  NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION					0x000002fc
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_DITHER				0x00000000
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE			0x00000001
+#define   NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_SUBTR_TRUNCATE			0x00000002
+
+
+#define NV01_IMAGE_SRCCOPY_AND								0x00000064
+
+#define  NV01_IMAGE_SRCCOPY_AND_NOTIFY							0x00000104
+#define  NV01_IMAGE_SRCCOPY_AND_DMA_NOTIFY						0x00000180
+#define  NV01_IMAGE_SRCCOPY_AND_IMAGE_OUTPUT						0x00000200
+#define  NV01_IMAGE_SRCCOPY_AND_IMAGE_INPUT						0x00000204
+
+
+#define NV05_INDEXED_IMAGE_FROM_CPU							0x00000064
+
+#define  NV05_INDEXED_IMAGE_FROM_CPU_COLOR_KEY						0x00000188
+#define  NV05_INDEXED_IMAGE_FROM_CPU_CLIP_RECTANGLE					0x0000018c
+#define  NV05_INDEXED_IMAGE_FROM_CPU_PATTERN						0x00000190
+#define  NV05_INDEXED_IMAGE_FROM_CPU_ROP						0x00000194
+#define  NV05_INDEXED_IMAGE_FROM_CPU_BETA1						0x00000198
+#define  NV05_INDEXED_IMAGE_FROM_CPU_BETA4						0x0000019c
+#define  NV05_INDEXED_IMAGE_FROM_CPU_SURFACE						0x000001a0
+#define  NV05_INDEXED_IMAGE_FROM_CPU_COLOR_CONVERSION					0x000003e0
+#define  NV05_INDEXED_IMAGE_FROM_CPU_OPERATION						0x000003e4
+#define  NV05_INDEXED_IMAGE_FROM_CPU_INDICES						0x00000400
+
+
+#define NV05_IMAGE_FROM_CPU								0x00000065
+
+#define  NV05_IMAGE_FROM_CPU_BETA4							0x00000198
+#define  NV05_IMAGE_FROM_CPU_SURFACE							0x0000019c
+
+
+#define NV05_STRETCHED_IMAGE_FROM_CPU							0x00000066
+
+#define  NV05_STRETCHED_IMAGE_FROM_CPU_BETA4						0x00000194
+#define  NV05_STRETCHED_IMAGE_FROM_CPU_SURFACE						0x00000198
+#define  NV05_STRETCHED_IMAGE_FROM_CPU_COLOR_CONVERSION					0x000002f8
+
+
+#define NV04_IMAGE_BLEND_PREMULT							0x00000067
+
+#define  NV04_IMAGE_BLEND_PREMULT_NOP							0x00000100
+#define  NV04_IMAGE_BLEND_PREMULT_NOTIFY						0x00000104
+#define  NV04_IMAGE_BLEND_PREMULT_DMA_NOTIFY						0x00000180
+#define  NV04_IMAGE_BLEND_PREMULT_IMAGE_OUTPUT						0x00000200
+#define  NV04_IMAGE_BLEND_PREMULT_BETA_INPUT						0x00000204
+#define  NV04_IMAGE_BLEND_PREMULT_IMAGE_INPUT						0x00000208
+
+
+#define NV03_CHANNEL_PIO								0x0000006a
+
+
+
+#define NV03_CHANNEL_DMA								0x0000006b
+
+
+
+#define NV04_BETA_SOLID									0x00000072
+
+#define  NV04_BETA_SOLID_NOP								0x00000100
+#define  NV04_BETA_SOLID_NOTIFY								0x00000104
+#define  NV04_BETA_SOLID_DMA_NOTIFY							0x00000180
+#define  NV04_BETA_SOLID_BETA_OUTPUT							0x00000200
+#define  NV04_BETA_SOLID_BETA_FACTOR							0x00000300
+
+
+#define NV04_STRETCHED_IMAGE_FROM_CPU							0x00000076
+
+
+
+#define NV04_SCALED_IMAGE_FROM_MEMORY							0x00000077
+
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_NOP						0x00000100
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_NOTIFY						0x00000104
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY					0x00000180
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE					0x00000184
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_PATTERN						0x00000188
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_ROP						0x0000018c
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_BETA1						0x00000190
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_BETA4						0x00000194
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE						0x00000198
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION					0x000002fc
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_DITHER				0x00000000
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE			0x00000001
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_SUBTR_TRUNCATE			0x00000002
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT					0x00000300
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5				0x00000001
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5				0x00000002
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8				0x00000003
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8				0x00000004
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_V8YB8U8YA8				0x00000005
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_YB8V8YA8U8				0x00000006
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5				0x00000007
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8					0x00000008
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_AY8				0x00000009
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION					0x00000304
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_AND				0x00000000
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_ROP_AND				0x00000001
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_AND				0x00000002
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY				0x00000003
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_PREMULT			0x00000004
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_PREMULT				0x00000005
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT					0x00000308
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_SHIFT				0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_MASK				0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT				16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_MASK				0xffff0000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE					0x0000030c
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_SHIFT				0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_MASK				0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT				16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_MASK				0xffff0000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT					0x00000310
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_SHIFT				0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_MASK				0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT				16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_MASK				0xffff0000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE						0x00000314
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_SHIFT				0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_MASK					0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT				16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_MASK					0xffff0000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_DU_DX						0x00000318
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_DV_DY						0x0000031c
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_SIZE						0x00000400
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_W_SHIFT					0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_W_MASK					0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT					16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_H_MASK					0xffff0000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT						0x00000404
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_SHIFT				0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_MASK				0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_SHIFT				16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_MASK				0x00ff0000
+#define    NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER				0x00010000
+#define    NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CORNER				0x00020000
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_SHIFT				24
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_MASK				0xff000000
+#define    NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE			0x00000000
+#define    NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_BILINEAR				0x01000000
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_ADDRESS						0x00000408
+#define  NV04_SCALED_IMAGE_FROM_MEMORY_POINT						0x0000040c
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_POINT_X_SHIFT					0
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_POINT_X_MASK					0x0000ffff
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_SHIFT					16
+#define   NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_MASK					0xffff0000
+
+
+#define NV10_TEXTURE_FROM_CPU								0x0000007b
+
+#define  NV10_TEXTURE_FROM_CPU_NOP							0x00000100
+#define  NV10_TEXTURE_FROM_CPU_NOTIFY							0x00000104
+#define  NV10_TEXTURE_FROM_CPU_WAIT_FOR_IDLE						0x00000108
+#define  NV10_TEXTURE_FROM_CPU_PM_TRIGGER						0x00000140
+#define  NV10_TEXTURE_FROM_CPU_DMA_NOTIFY						0x00000180
+#define  NV10_TEXTURE_FROM_CPU_SURFACE							0x00000184
+#define  NV10_TEXTURE_FROM_CPU_COLOR_FORMAT						0x00000300
+#define  NV10_TEXTURE_FROM_CPU_POINT							0x00000304
+#define   NV10_TEXTURE_FROM_CPU_POINT_X_SHIFT						0
+#define   NV10_TEXTURE_FROM_CPU_POINT_X_MASK						0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_POINT_Y_SHIFT						16
+#define   NV10_TEXTURE_FROM_CPU_POINT_Y_MASK						0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_SIZE							0x00000308
+#define   NV10_TEXTURE_FROM_CPU_SIZE_W_SHIFT						0
+#define   NV10_TEXTURE_FROM_CPU_SIZE_W_MASK						0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_SIZE_H_SHIFT						16
+#define   NV10_TEXTURE_FROM_CPU_SIZE_H_MASK						0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL						0x0000030c
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_SHIFT					0
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_MASK					0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_SHIFT					16
+#define   NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_MASK					0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL						0x00000310
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_SHIFT					0
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_MASK					0x0000ffff
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_SHIFT					16
+#define   NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_MASK					0xffff0000
+#define  NV10_TEXTURE_FROM_CPU_COLOR(x)							(0x00000400+((x)*4))
+#define  NV10_TEXTURE_FROM_CPU_COLOR__SIZE						0x00000700
+
+
+#define NV10_VIDEO_DISPLAY								0x0000007c
+
+
+
+#define NV10_DVD_SUBPICTURE								0x00000088
+
+
+
+#define NV10_SCALED_IMAGE_FROM_MEMORY							0x00000089
+
+#define  NV10_SCALED_IMAGE_FROM_MEMORY_WAIT_FOR_IDLE					0x00000108
+
+
+#define NV10_IMAGE_FROM_CPU								0x0000008a
+
+#define  NV10_IMAGE_FROM_CPU_COLOR_CONVERSION						0x000002f8
+
+
+#define NV10_CONTEXT_SURFACES_3D							0x00000093
+
+
+
+#define NV10_DX5_TEXTURE_TRIANGLE							0x00000094
+
+
+
+#define NV10_DX6_MULTI_TEXTURE_TRIANGLE							0x00000095
+
+
+
+#define NV11TCL										0x00000096
+
+#define  NV11TCL_COLOR_LOGIC_OP_ENABLE							0x00000d40
+#define  NV11TCL_COLOR_LOGIC_OP_OP							0x00000d44
+#define   NV11TCL_COLOR_LOGIC_OP_OP_CLEAR						0x00001500
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND							0x00001501
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND_REVERSE						0x00001502
+#define   NV11TCL_COLOR_LOGIC_OP_OP_COPY						0x00001503
+#define   NV11TCL_COLOR_LOGIC_OP_OP_AND_INVERTED					0x00001504
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NOOP						0x00001505
+#define   NV11TCL_COLOR_LOGIC_OP_OP_XOR							0x00001506
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR							0x00001507
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NOR							0x00001508
+#define   NV11TCL_COLOR_LOGIC_OP_OP_EQUIV						0x00001509
+#define   NV11TCL_COLOR_LOGIC_OP_OP_INVERT						0x0000150a
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR_REVERSE						0x0000150b
+#define   NV11TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED					0x0000150c
+#define   NV11TCL_COLOR_LOGIC_OP_OP_OR_INVERTED						0x0000150d
+#define   NV11TCL_COLOR_LOGIC_OP_OP_NAND						0x0000150e
+#define   NV11TCL_COLOR_LOGIC_OP_OP_SET							0x0000150f
+
+
+#define NV20TCL										0x00000097
+
+#define  NV20TCL_NOP									0x00000100
+#define  NV20TCL_NOTIFY									0x00000104
+#define  NV20TCL_DMA_NOTIFY								0x00000180
+#define  NV20TCL_DMA_IN_MEMORY0								0x00000184
+#define  NV20TCL_DMA_IN_MEMORY1								0x00000188
+#define  NV20TCL_DMA_IN_MEMORY2								0x00000194
+#define  NV20TCL_DMA_IN_MEMORY3								0x00000198
+#define  NV20TCL_DMA_IN_MEMORY6								0x000001a4
+#define  NV20TCL_DMA_IN_MEMORY7								0x000001a8
+#define  NV20TCL_VIEWPORT_HORIZ								0x00000200
+#define  NV20TCL_VIEWPORT_VERT								0x00000204
+#define  NV20TCL_BUFFER_FORMAT								0x00000208
+#define  NV20TCL_BUFFER_PITCH								0x0000020c
+#define  NV20TCL_COLOR_OFFSET								0x00000210
+#define  NV20TCL_ZETA_OFFSET								0x00000214
+#define  NV20TCL_RC_IN_ALPHA(x)								(0x00000260+((x)*4))
+#define  NV20TCL_RC_IN_ALPHA__SIZE							0x00000008
+#define  NV20TCL_RC_FINAL0								0x00000288
+#define  NV20TCL_RC_FINAL1								0x0000028c
+#define  NV20TCL_LIGHT_CONTROL								0x00000294
+#define  NV20TCL_FOG_MODE								0x0000029c
+#define  NV20TCL_FOG_COORD_DIST								0x000002a0
+#define  NV20TCL_FOG_ENABLE								0x000002a4
+#define  NV20TCL_FOG_COLOR								0x000002a8
+#define  NV20TCL_VIEWPORT_CLIP_MODE							0x000002b4
+#define  NV20TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*4))
+#define  NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define  NV20TCL_VIEWPORT_CLIP_VERT(x)							(0x000002e0+((x)*4))
+#define  NV20TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV20TCL_ALPHA_FUNC_ENABLE							0x00000300
+#define  NV20TCL_BLEND_FUNC_ENABLE							0x00000304
+#define  NV20TCL_CULL_FACE_ENABLE							0x00000308
+#define  NV20TCL_DEPTH_TEST_ENABLE							0x0000030c
+#define  NV20TCL_DITHER_ENABLE								0x00000310
+#define  NV20TCL_LIGHTING_ENABLE							0x00000314
+#define  NV20TCL_POINT_PARAMETERS_ENABLE						0x00000318
+#define  NV20TCL_LINE_SMOOTH_ENABLE							0x00000320
+#define  NV20TCL_POLYGON_SMOOTH_ENABLE							0x00000324
+#define  NV20TCL_STENCIL_ENABLE								0x0000032c
+#define  NV20TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000330
+#define  NV20TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000334
+#define  NV20TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000338
+#define  NV20TCL_ALPHA_FUNC_FUNC							0x0000033c
+#define   NV20TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV20TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV20TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV20TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV20TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV20TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV20TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV20TCL_ALPHA_FUNC_REF								0x00000340
+#define  NV20TCL_BLEND_FUNC_SRC								0x00000344
+#define   NV20TCL_BLEND_FUNC_SRC_ZERO							0x00000000
+#define   NV20TCL_BLEND_FUNC_SRC_ONE							0x00000001
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_COLOR						0x00000300
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA						0x00000302
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV20TCL_BLEND_FUNC_SRC_DST_ALPHA						0x00000304
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV20TCL_BLEND_FUNC_SRC_DST_COLOR						0x00000306
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE					0x00000308
+#define   NV20TCL_BLEND_FUNC_SRC_CONSTANT_COLOR						0x00008001
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV20TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA						0x00008003
+#define   NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV20TCL_BLEND_FUNC_DST								0x00000348
+#define   NV20TCL_BLEND_FUNC_DST_ZERO							0x00000000
+#define   NV20TCL_BLEND_FUNC_DST_ONE							0x00000001
+#define   NV20TCL_BLEND_FUNC_DST_SRC_COLOR						0x00000300
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR					0x00000301
+#define   NV20TCL_BLEND_FUNC_DST_SRC_ALPHA						0x00000302
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA					0x00000303
+#define   NV20TCL_BLEND_FUNC_DST_DST_ALPHA						0x00000304
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA					0x00000305
+#define   NV20TCL_BLEND_FUNC_DST_DST_COLOR						0x00000306
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR					0x00000307
+#define   NV20TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE					0x00000308
+#define   NV20TCL_BLEND_FUNC_DST_CONSTANT_COLOR						0x00008001
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV20TCL_BLEND_FUNC_DST_CONSTANT_ALPHA						0x00008003
+#define   NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV20TCL_BLEND_COLOR								0x0000034c
+#define  NV20TCL_BLEND_EQUATION								0x00000350
+#define   NV20TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV20TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV20TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV20TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a
+#define   NV20TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV20TCL_DEPTH_FUNC								0x00000354
+#define   NV20TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV20TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV20TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV20TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV20TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV20TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV20TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV20TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV20TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV20TCL_COLOR_MASK								0x00000358
+#define  NV20TCL_DEPTH_WRITE_ENABLE							0x0000035c
+#define  NV20TCL_STENCIL_MASK								0x00000360
+#define  NV20TCL_STENCIL_FUNC_FUNC							0x00000364
+#define   NV20TCL_STENCIL_FUNC_FUNC_NEVER						0x00000200
+#define   NV20TCL_STENCIL_FUNC_FUNC_LESS						0x00000201
+#define   NV20TCL_STENCIL_FUNC_FUNC_EQUAL						0x00000202
+#define   NV20TCL_STENCIL_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV20TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_STENCIL_FUNC_FUNC_GREATER						0x00000204
+#define   NV20TCL_STENCIL_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV20TCL_STENCIL_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV20TCL_STENCIL_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV20TCL_STENCIL_FUNC_REF							0x00000368
+#define  NV20TCL_STENCIL_FUNC_MASK							0x0000036c
+#define  NV20TCL_STENCIL_OP_FAIL							0x00000370
+#define   NV20TCL_STENCIL_OP_FAIL_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_FAIL_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_FAIL_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_FAIL_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_FAIL_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_FAIL_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_FAIL_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_FAIL_DECR_WRAP						0x00008508
+#define  NV20TCL_STENCIL_OP_ZFAIL							0x00000374
+#define   NV20TCL_STENCIL_OP_ZFAIL_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_ZFAIL_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_ZFAIL_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_ZFAIL_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_ZFAIL_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_ZFAIL_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_ZFAIL_DECR_WRAP						0x00008508
+#define  NV20TCL_STENCIL_OP_ZPASS							0x00000378
+#define   NV20TCL_STENCIL_OP_ZPASS_ZERO							0x00000000
+#define   NV20TCL_STENCIL_OP_ZPASS_INVERT						0x0000150a
+#define   NV20TCL_STENCIL_OP_ZPASS_KEEP							0x00001e00
+#define   NV20TCL_STENCIL_OP_ZPASS_REPLACE						0x00001e01
+#define   NV20TCL_STENCIL_OP_ZPASS_INCR							0x00001e02
+#define   NV20TCL_STENCIL_OP_ZPASS_DECR							0x00001e03
+#define   NV20TCL_STENCIL_OP_ZPASS_INCR_WRAP						0x00008507
+#define   NV20TCL_STENCIL_OP_ZPASS_DECR_WRAP						0x00008508
+#define  NV20TCL_SHADE_MODEL								0x0000037c
+#define   NV20TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV20TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV20TCL_LINE_WIDTH								0x00000380
+#define  NV20TCL_POLYGON_OFFSET_FACTOR							0x00000384
+#define  NV20TCL_POLYGON_OFFSET_UNITS							0x00000388
+#define  NV20TCL_POLYGON_MODE_FRONT							0x0000038c
+#define   NV20TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV20TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV20TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV20TCL_POLYGON_MODE_BACK							0x00000390
+#define   NV20TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV20TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV20TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV20TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV20TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV20TCL_CULL_FACE								0x0000039c
+#define   NV20TCL_CULL_FACE_FRONT							0x00000404
+#define   NV20TCL_CULL_FACE_BACK							0x00000405
+#define   NV20TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV20TCL_FRONT_FACE								0x000003a0
+#define   NV20TCL_FRONT_FACE_CW								0x00000900
+#define   NV20TCL_FRONT_FACE_CCW							0x00000901
+#define  NV20TCL_NORMALIZE_ENABLE							0x000003a4
+#define  NV20TCL_SEPARATE_SPECULAR_ENABLE						0x000003b8
+#define  NV20TCL_ENABLED_LIGHTS								0x000003bc
+#define  NV20TCL_CLIP_PLANE_ENABLE(x)							(0x000003c0+((x)*4))
+#define  NV20TCL_CLIP_PLANE_ENABLE__SIZE						0x00000010
+#define  NV20TCL_TX_MATRIX_ENABLE(x)							(0x00000420+((x)*4))
+#define  NV20TCL_TX_MATRIX_ENABLE__SIZE							0x00000004
+#define  NV20TCL_POINT_SIZE								0x0000043c
+#define  NV20TCL_MODELVIEW_MATRIX(x)							(0x00000480+((x)*4))
+#define  NV20TCL_MODELVIEW_MATRIX__SIZE							0x00000010
+#define  NV20TCL_PROJECTION_MATRIX(x)							(0x00000680+((x)*4))
+#define  NV20TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV20TCL_TX0_MATRIX(x)								(0x000006c0+((x)*4))
+#define  NV20TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX1_MATRIX(x)								(0x00000700+((x)*4))
+#define  NV20TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX2_MATRIX(x)								(0x00000740+((x)*4))
+#define  NV20TCL_TX2_MATRIX__SIZE							0x00000010
+#define  NV20TCL_TX3_MATRIX(x)								(0x00000780+((x)*4))
+#define  NV20TCL_TX3_MATRIX__SIZE							0x00000010
+#define  NV20TCL_FOG_EQUATION_CONSTANT							0x000009c0
+#define  NV20TCL_FOG_EQUATION_LINEAR							0x000009c4
+#define  NV20TCL_FOG_EQUATION_QUADRATIC							0x000009c8
+#define  NV20TCL_VIEWPORT_SCALE0_X							0x00000a20
+#define  NV20TCL_VIEWPORT_SCALE0_Y							0x00000a24
+#define  NV20TCL_VIEWPORT_SCALE0_Z							0x00000a28
+#define  NV20TCL_VIEWPORT_SCALE0_W							0x00000a2c
+#define  NV20TCL_POINT_PARAMETER(x)							(0x00000a30+((x)*4))
+#define  NV20TCL_POINT_PARAMETER__SIZE							0x00000007
+#define  NV20TCL_RC_CONSTANT_COLOR0(x)							(0x00000a60+((x)*4))
+#define  NV20TCL_RC_CONSTANT_COLOR0__SIZE						0x00000008
+#define  NV20TCL_RC_CONSTANT_COLOR1(x)							(0x00000a80+((x)*4))
+#define  NV20TCL_RC_CONSTANT_COLOR1__SIZE						0x00000008
+#define  NV20TCL_RC_OUT_ALPHA(x)							(0x00000aa0+((x)*4))
+#define  NV20TCL_RC_OUT_ALPHA__SIZE							0x00000008
+#define  NV20TCL_RC_IN_RGB(x)								(0x00000ac0+((x)*4))
+#define  NV20TCL_RC_IN_RGB__SIZE							0x00000008
+#define  NV20TCL_VIEWPORT_SCALE1_X							0x00000af0
+#define  NV20TCL_VIEWPORT_SCALE1_Y							0x00000af4
+#define  NV20TCL_VIEWPORT_SCALE1_Z							0x00000af8
+#define  NV20TCL_VIEWPORT_SCALE1_W							0x00000afc
+#define  NV20TCL_VP_UPLOAD_INST(x)							(0x00000b00+((x)*4))
+#define  NV20TCL_VP_UPLOAD_INST__SIZE							0x00000004
+#define  NV20TCL_VP_UPLOAD_CONST(x)							(0x00000b80+((x)*4))
+#define  NV20TCL_VP_UPLOAD_CONST__SIZE							0x00000004
+#define  NV20TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV20TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4))
+#define  NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV20TCL_VERTEX_POS_3F_X							0x00001500
+#define  NV20TCL_VERTEX_POS_3F_Y							0x00001504
+#define  NV20TCL_VERTEX_POS_3F_Z							0x00001508
+#define  NV20TCL_VERTEX_POS_4F_X							0x00001518
+#define  NV20TCL_VERTEX_POS_4F_Y							0x0000151c
+#define  NV20TCL_VERTEX_POS_4F_Z							0x00001520
+#define  NV20TCL_VERTEX_POS_3I_XY							0x00001528
+#define   NV20TCL_VERTEX_POS_3I_XY_X_SHIFT						0
+#define   NV20TCL_VERTEX_POS_3I_XY_X_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_POS_3I_XY_Y_SHIFT						16
+#define   NV20TCL_VERTEX_POS_3I_XY_Y_MASK						0xffff0000
+#define  NV20TCL_VERTEX_POS_3I_Z							0x0000152c
+#define   NV20TCL_VERTEX_POS_3I_Z_Z_SHIFT						0
+#define   NV20TCL_VERTEX_POS_3I_Z_Z_MASK						0x0000ffff
+#define  NV20TCL_VERTEX_NOR_3F_X							0x00001530
+#define  NV20TCL_VERTEX_NOR_3F_Y							0x00001534
+#define  NV20TCL_VERTEX_NOR_3F_Z							0x00001538
+#define  NV20TCL_VERTEX_NOR_3I_XY							0x00001540
+#define   NV20TCL_VERTEX_NOR_3I_XY_X_SHIFT						0
+#define   NV20TCL_VERTEX_NOR_3I_XY_X_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_NOR_3I_XY_Y_SHIFT						16
+#define   NV20TCL_VERTEX_NOR_3I_XY_Y_MASK						0xffff0000
+#define  NV20TCL_VERTEX_NOR_3I_Z							0x00001544
+#define   NV20TCL_VERTEX_NOR_3I_Z_Z_SHIFT						0
+#define   NV20TCL_VERTEX_NOR_3I_Z_Z_MASK						0x0000ffff
+#define  NV20TCL_VERTEX_COL_4F_X							0x00001550
+#define  NV20TCL_VERTEX_COL_4F_Y							0x00001554
+#define  NV20TCL_VERTEX_COL_4F_Z							0x00001558
+#define  NV20TCL_VERTEX_COL_4F_W							0x0000155c
+#define  NV20TCL_VERTEX_COL_3F_X							0x00001560
+#define  NV20TCL_VERTEX_COL_3F_Y							0x00001564
+#define  NV20TCL_VERTEX_COL_3F_Z							0x00001568
+#define  NV20TCL_VERTEX_COL_4I								0x0000156c
+#define   NV20TCL_VERTEX_COL_4I_R_SHIFT							0
+#define   NV20TCL_VERTEX_COL_4I_R_MASK							0x000000ff
+#define   NV20TCL_VERTEX_COL_4I_G_SHIFT							8
+#define   NV20TCL_VERTEX_COL_4I_G_MASK							0x0000ff00
+#define   NV20TCL_VERTEX_COL_4I_B_SHIFT							16
+#define   NV20TCL_VERTEX_COL_4I_B_MASK							0x00ff0000
+#define   NV20TCL_VERTEX_COL_4I_A_SHIFT							24
+#define   NV20TCL_VERTEX_COL_4I_A_MASK							0xff000000
+#define  NV20TCL_VERTEX_COL2_3F_X							0x00001580
+#define  NV20TCL_VERTEX_COL2_3F_Y							0x00001584
+#define  NV20TCL_VERTEX_COL2_3F_Z							0x00001588
+#define  NV20TCL_VERTEX_COL2_4I								0x0000158c
+#define   NV20TCL_VERTEX_COL2_4I_R_SHIFT						0
+#define   NV20TCL_VERTEX_COL2_4I_R_MASK							0x000000ff
+#define   NV20TCL_VERTEX_COL2_4I_G_SHIFT						8
+#define   NV20TCL_VERTEX_COL2_4I_G_MASK							0x0000ff00
+#define   NV20TCL_VERTEX_COL2_4I_B_SHIFT						16
+#define   NV20TCL_VERTEX_COL2_4I_B_MASK							0x00ff0000
+#define   NV20TCL_VERTEX_COL2_4I_A_SHIFT						24
+#define   NV20TCL_VERTEX_COL2_4I_A_MASK							0xff000000
+#define  NV20TCL_VERTEX_TX0_2F_S							0x00001590
+#define  NV20TCL_VERTEX_TX0_2F_T							0x00001594
+#define  NV20TCL_VERTEX_TX0_2I								0x00001598
+#define   NV20TCL_VERTEX_TX0_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX0_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX0_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX0_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX0_4F_S							0x000015a0
+#define  NV20TCL_VERTEX_TX0_4F_T							0x000015a4
+#define  NV20TCL_VERTEX_TX0_4F_R							0x000015a8
+#define  NV20TCL_VERTEX_TX0_4F_Q							0x000015ac
+#define  NV20TCL_VERTEX_TX0_4I_ST							0x000015b0
+#define   NV20TCL_VERTEX_TX0_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX0_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX0_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX0_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX0_4I_RQ							0x000015b4
+#define   NV20TCL_VERTEX_TX0_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX0_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX0_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX0_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX1_2F_S							0x000015b8
+#define  NV20TCL_VERTEX_TX1_2F_T							0x000015bc
+#define  NV20TCL_VERTEX_TX1_2I								0x000015c0
+#define   NV20TCL_VERTEX_TX1_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX1_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX1_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX1_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX1_4F_S							0x000015c8
+#define  NV20TCL_VERTEX_TX1_4F_T							0x000015cc
+#define  NV20TCL_VERTEX_TX1_4F_R							0x000015d0
+#define  NV20TCL_VERTEX_TX1_4F_Q							0x000015d4
+#define  NV20TCL_VERTEX_TX1_4I_ST							0x000015d8
+#define   NV20TCL_VERTEX_TX1_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX1_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX1_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX1_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX1_4I_RQ							0x000015dc
+#define   NV20TCL_VERTEX_TX1_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX1_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX1_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX1_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX2_2F_S							0x000015e0
+#define  NV20TCL_VERTEX_TX2_2F_T							0x000015e4
+#define  NV20TCL_VERTEX_TX2_2I								0x000015e8
+#define   NV20TCL_VERTEX_TX2_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX2_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX2_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX2_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX2_4F_S							0x000015f0
+#define  NV20TCL_VERTEX_TX2_4F_T							0x000015f4
+#define  NV20TCL_VERTEX_TX2_4F_R							0x000015f8
+#define  NV20TCL_VERTEX_TX2_4F_Q							0x000015fc
+#define  NV20TCL_VERTEX_TX2_4I_ST							0x00001600
+#define   NV20TCL_VERTEX_TX2_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX2_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX2_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX2_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX2_4I_RQ							0x00001604
+#define   NV20TCL_VERTEX_TX2_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX2_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX2_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX2_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX3_2F_S							0x00001608
+#define  NV20TCL_VERTEX_TX3_2F_T							0x0000160c
+#define  NV20TCL_VERTEX_TX3_2I								0x00001610
+#define   NV20TCL_VERTEX_TX3_2I_S_SHIFT							0
+#define   NV20TCL_VERTEX_TX3_2I_S_MASK							0x0000ffff
+#define   NV20TCL_VERTEX_TX3_2I_T_SHIFT							16
+#define   NV20TCL_VERTEX_TX3_2I_T_MASK							0xffff0000
+#define  NV20TCL_VERTEX_TX3_4F_S							0x00001620
+#define  NV20TCL_VERTEX_TX3_4F_T							0x00001624
+#define  NV20TCL_VERTEX_TX3_4F_R							0x00001628
+#define  NV20TCL_VERTEX_TX3_4F_Q							0x0000162c
+#define  NV20TCL_VERTEX_TX3_4I_ST							0x00001630
+#define   NV20TCL_VERTEX_TX3_4I_ST_S_SHIFT						0
+#define   NV20TCL_VERTEX_TX3_4I_ST_S_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX3_4I_ST_T_SHIFT						16
+#define   NV20TCL_VERTEX_TX3_4I_ST_T_MASK						0xffff0000
+#define  NV20TCL_VERTEX_TX3_4I_RQ							0x00001634
+#define   NV20TCL_VERTEX_TX3_4I_RQ_R_SHIFT						0
+#define   NV20TCL_VERTEX_TX3_4I_RQ_R_MASK						0x0000ffff
+#define   NV20TCL_VERTEX_TX3_4I_RQ_Q_SHIFT						16
+#define   NV20TCL_VERTEX_TX3_4I_RQ_Q_MASK						0xffff0000
+#define  NV20TCL_EDGEFLAG_ENABLE							0x000016bc
+#define  NV20TCL_VERTEX_ATTR_OFFSET(x)							(0x00001720+((x)*4))
+#define  NV20TCL_VERTEX_ATTR_OFFSET__SIZE						0x00000010
+#define  NV20TCL_VERTEX_ARRAY_FORMAT(x)							(0x00001760+((x)*4))
+#define  NV20TCL_VERTEX_ARRAY_FORMAT__SIZE						0x00000010
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_TYPE_SHIFT					0
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_TYPE_MASK						0x0000000f
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_FIELDS_SHIFT					4
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_FIELDS_MASK					0x000000f0
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_STRIDE_SHIFT					8
+#define   NV20TCL_VERTEX_ARRAY_FORMAT_STRIDE_MASK					0x0000ff00
+#define  NV20TCL_COLOR_LOGIC_OP_ENABLE							0x000017bc
+#define  NV20TCL_COLOR_LOGIC_OP_OP							0x000017c0
+#define  NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE						0x000017c4
+#define  NV20TCL_TX_SHADER_CULL_MODE							0x000017f8
+#define  NV20TCL_VERTEX_BEGIN_END							0x000017fc
+#define  NV20TCL_VERTEX_BUFFER_DRAW_ARRAYS						0x00001810
+#define  NV20TCL_VERTEX_ARRAY_DATA							0x00001818
+#define  NV20TCL_TX_OFFSET(x)								(0x00001b00+((x)*64))
+#define  NV20TCL_TX_OFFSET__SIZE							0x00000004
+#define  NV20TCL_TX_FORMAT(x)								(0x00001b04+((x)*64))
+#define  NV20TCL_TX_FORMAT__SIZE							0x00000004
+#define  NV20TCL_TX_WRAP(x)								(0x00001b08+((x)*64))
+#define  NV20TCL_TX_WRAP__SIZE								0x00000004
+#define  NV20TCL_TX_ENABLE(x)								(0x00001b0c+((x)*64))
+#define  NV20TCL_TX_ENABLE__SIZE							0x00000004
+#define  NV20TCL_TX_FILTER(x)								(0x00001b14+((x)*64))
+#define  NV20TCL_TX_FILTER__SIZE							0x00000004
+#define  NV20TCL_TX_BORDER_COLOR(x)							(0x00001b24+((x)*64))
+#define  NV20TCL_TX_BORDER_COLOR__SIZE							0x00000004
+#define  NV20TCL_SCISSOR_HORIZ								0x00001c30
+#define  NV20TCL_SCISSOR_VERT								0x00001c50
+#define  NV20TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define  NV20TCL_CLEAR_VALUE								0x00001d90
+#define  NV20TCL_CLEAR_BUFFERS								0x00001d94
+#define  NV20TCL_RC_COLOR0								0x00001e20
+#define  NV20TCL_RC_COLOR1								0x00001e24
+#define  NV20TCL_RC_OUT_RGB(x)								(0x00001e40+((x)*4))
+#define  NV20TCL_RC_OUT_RGB__SIZE							0x00000008
+#define  NV20TCL_RC_ENABLE								0x00001e60
+#define  NV20TCL_TX_SHADER_OP								0x00001e70
+#define  NV20TCL_VP_UPLOAD_CONST_ID							0x00001ea4
+
+
+#define NV17TCL										0x00000099
+
+#define  NV17TCL_DMA_IN_MEMORY4								0x000001ac
+#define  NV17TCL_DMA_IN_MEMORY5								0x000001b0
+#define  NV17TCL_COLOR_MASK_ENABLE							0x000002bc
+#define  NV17TCL_LMA_DEPTH_BUFFER_PITCH							0x00000d5c
+#define  NV17TCL_LMA_DEPTH_BUFFER_OFFSET						0x00000d60
+#define  NV17TCL_LMA_DEPTH_FILL_VALUE							0x00000d68
+#define  NV17TCL_LMA_DEPTH_BUFFER_CLEAR							0x00000d6c
+#define  NV17TCL_LMA_DEPTH_ENABLE							0x00001658
+
+
+#define NV20_SWIZZLED_SURFACE								0x0000009e
+
+
+
+#define NV12_IMAGE_BLIT									0x0000009f
+
+
+
+#define NV30_CONTEXT_SURFACES_2D							0x00000362
+
+
+
+#define NV30_STRETCHED_IMAGE_FROM_CPU							0x00000366
+
+
+
+#define NV30_TEXTURE_FROM_CPU								0x0000037b
+
+
+
+#define NV30_SCALED_IMAGE_FROM_MEMORY							0x00000389
+
+
+
+#define NV30_IMAGE_FROM_CPU								0x0000038a
+
+
+
+#define NV30TCL										0x00000397
+
+
+
+#define NV30_SWIZZLED_SURFACE								0x0000039e
+
+
+
+#define NV35TCL										0x00000497
+
+
+
+#define NV25TCL										0x00000597
+
+#define  NV25TCL_DMA_IN_MEMORY4								0x0000019c
+#define  NV25TCL_DMA_IN_MEMORY5								0x000001a0
+#define  NV25TCL_DMA_IN_MEMORY8								0x000001ac
+#define  NV25TCL_DMA_IN_MEMORY9								0x000001b0
+
+
+#define NV34TCL										0x00000697
+
+#define  NV34TCL_NOP									0x00000100
+#define  NV34TCL_NOTIFY									0x00000104
+#define  NV34TCL_DMA_NOTIFY								0x00000180
+#define  NV34TCL_DMA_TEXTURE0								0x00000184
+#define  NV34TCL_DMA_TEXTURE1								0x00000188
+#define  NV34TCL_DMA_COLOR1								0x0000018c
+#define  NV34TCL_DMA_COLOR0								0x00000194
+#define  NV34TCL_DMA_ZETA								0x00000198
+#define  NV34TCL_DMA_VTXBUF0								0x0000019c
+#define  NV34TCL_DMA_VTXBUF1								0x000001a0
+#define  NV34TCL_DMA_FENCE								0x000001a4
+#define  NV34TCL_DMA_QUERY								0x000001a8
+#define  NV34TCL_DMA_IN_MEMORY7								0x000001ac
+#define  NV34TCL_DMA_IN_MEMORY8								0x000001b0
+#define  NV34TCL_RT_HORIZ								0x00000200
+#define   NV34TCL_RT_HORIZ_X_SHIFT							0
+#define   NV34TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_RT_HORIZ_W_SHIFT							16
+#define   NV34TCL_RT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_RT_VERT								0x00000204
+#define   NV34TCL_RT_VERT_Y_SHIFT							0
+#define   NV34TCL_RT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_RT_VERT_H_SHIFT							16
+#define   NV34TCL_RT_VERT_H_MASK							0xffff0000
+#define  NV34TCL_RT_FORMAT								0x00000208
+#define   NV34TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV34TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV34TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV34TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV34TCL_RT_FORMAT_ZETA_SHIFT							5
+#define   NV34TCL_RT_FORMAT_ZETA_MASK							0x000000e0
+#define    NV34TCL_RT_FORMAT_ZETA_Z16							0x00000020
+#define    NV34TCL_RT_FORMAT_ZETA_Z24S8							0x00000040
+#define   NV34TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV34TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV34TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV34TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV34TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV34TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV34TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV34TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV34TCL_COLOR0_PITCH								0x0000020c
+#define   NV34TCL_COLOR0_PITCH_COLOR0_SHIFT						0
+#define   NV34TCL_COLOR0_PITCH_COLOR0_MASK						0x0000ffff
+#define   NV34TCL_COLOR0_PITCH_ZETA_SHIFT						16
+#define   NV34TCL_COLOR0_PITCH_ZETA_MASK						0xffff0000
+#define  NV34TCL_COLOR0_OFFSET								0x00000210
+#define  NV34TCL_ZETA_OFFSET								0x00000214
+#define  NV34TCL_COLOR1_OFFSET								0x00000218
+#define  NV34TCL_COLOR1_PITCH								0x0000021c
+#define  NV34TCL_RT_ENABLE								0x00000220
+#define   NV34TCL_RT_ENABLE_MRT								(1 <<  4)
+#define   NV34TCL_RT_ENABLE_COLOR1							(1 <<  1)
+#define   NV34TCL_RT_ENABLE_COLOR0							(1 <<  0)
+#define  NV34TCL_LMA_DEPTH_PITCH							0x0000022c
+#define  NV34TCL_LMA_DEPTH_OFFSET							0x00000230
+#define  NV34TCL_TX_UNITS_ENABLE							0x0000023c
+#define   NV34TCL_TX_UNITS_ENABLE_TX0							(1 <<  0)
+#define   NV34TCL_TX_UNITS_ENABLE_TX1							(1 <<  1)
+#define   NV34TCL_TX_UNITS_ENABLE_TX2							(1 <<  2)
+#define   NV34TCL_TX_UNITS_ENABLE_TX3							(1 <<  3)
+#define  NV34TCL_TX_MATRIX_ENABLE(x)							(0x00000240+((x)*4))
+#define  NV34TCL_TX_MATRIX_ENABLE__SIZE							0x00000004
+#define  NV34TCL_UNK0250(x)								(0x00000250+((x)*4))
+#define  NV34TCL_UNK0250__SIZE								0x00000004
+#define  NV34TCL_VIEWPORT_TX_ORIGIN							0x000002b8
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_X_SHIFT						0
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_X_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_Y_SHIFT						16
+#define   NV34TCL_VIEWPORT_TX_ORIGIN_Y_MASK						0xffff0000
+#define  NV34TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*8))
+#define  NV34TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_L_SHIFT						0
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_L_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_R_SHIFT						16
+#define   NV34TCL_VIEWPORT_CLIP_HORIZ_R_MASK						0xffff0000
+#define  NV34TCL_VIEWPORT_CLIP_VERT(x)							(0x000002c4+((x)*8))
+#define  NV34TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NV34TCL_VIEWPORT_CLIP_VERT_T_SHIFT						0
+#define   NV34TCL_VIEWPORT_CLIP_VERT_T_MASK						0x0000ffff
+#define   NV34TCL_VIEWPORT_CLIP_VERT_D_SHIFT						16
+#define   NV34TCL_VIEWPORT_CLIP_VERT_D_MASK						0xffff0000
+#define  NV34TCL_DITHER_ENABLE								0x00000300
+#define  NV34TCL_ALPHA_FUNC_ENABLE							0x00000304
+#define  NV34TCL_ALPHA_FUNC_FUNC							0x00000308
+#define   NV34TCL_ALPHA_FUNC_FUNC_NEVER							0x00000200
+#define   NV34TCL_ALPHA_FUNC_FUNC_LESS							0x00000201
+#define   NV34TCL_ALPHA_FUNC_FUNC_EQUAL							0x00000202
+#define   NV34TCL_ALPHA_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV34TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV34TCL_ALPHA_FUNC_FUNC_GREATER						0x00000204
+#define   NV34TCL_ALPHA_FUNC_FUNC_NOTEQUAL						0x00000205
+#define   NV34TCL_ALPHA_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV34TCL_ALPHA_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV34TCL_ALPHA_FUNC_REF								0x0000030c
+#define  NV34TCL_BLEND_FUNC_ENABLE							0x00000310
+#define  NV34TCL_BLEND_FUNC_SRC								0x00000314
+#define   NV34TCL_BLEND_FUNC_SRC_RGB_SHIFT						0
+#define   NV34TCL_BLEND_FUNC_SRC_RGB_MASK						0x0000ffff
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV34TCL_BLEND_FUNC_SRC_ALPHA_SHIFT						16
+#define   NV34TCL_BLEND_FUNC_SRC_ALPHA_MASK						0xffff0000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x03000000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x03020000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x03040000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x03060000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV34TCL_BLEND_FUNC_DST								0x00000318
+#define   NV34TCL_BLEND_FUNC_DST_RGB_SHIFT						0
+#define   NV34TCL_BLEND_FUNC_DST_RGB_MASK						0x0000ffff
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV34TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV34TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV34TCL_BLEND_FUNC_DST_ALPHA_SHIFT						16
+#define   NV34TCL_BLEND_FUNC_DST_ALPHA_MASK						0xffff0000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x03000000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x03020000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x03040000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x03060000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV34TCL_BLEND_COLOR								0x0000031c
+#define   NV34TCL_BLEND_COLOR_B_SHIFT							0
+#define   NV34TCL_BLEND_COLOR_B_MASK							0x000000ff
+#define   NV34TCL_BLEND_COLOR_G_SHIFT							8
+#define   NV34TCL_BLEND_COLOR_G_MASK							0x0000ff00
+#define   NV34TCL_BLEND_COLOR_R_SHIFT							16
+#define   NV34TCL_BLEND_COLOR_R_MASK							0x00ff0000
+#define   NV34TCL_BLEND_COLOR_A_SHIFT							24
+#define   NV34TCL_BLEND_COLOR_A_MASK							0xff000000
+#define  NV34TCL_BLEND_EQUATION								0x00000320
+#define   NV34TCL_BLEND_EQUATION_FUNC_ADD						0x00008006
+#define   NV34TCL_BLEND_EQUATION_MIN							0x00008007
+#define   NV34TCL_BLEND_EQUATION_MAX							0x00008008
+#define   NV34TCL_BLEND_EQUATION_FUNC_SUBTRACT						0x0000800a
+#define   NV34TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT					0x0000800b
+#define  NV34TCL_COLOR_MASK								0x00000324
+#define   NV34TCL_COLOR_MASK_B_SHIFT							0
+#define   NV34TCL_COLOR_MASK_B_MASK							0x000000ff
+#define   NV34TCL_COLOR_MASK_G_SHIFT							8
+#define   NV34TCL_COLOR_MASK_G_MASK							0x0000ff00
+#define   NV34TCL_COLOR_MASK_R_SHIFT							16
+#define   NV34TCL_COLOR_MASK_R_MASK							0x00ff0000
+#define   NV34TCL_COLOR_MASK_A_SHIFT							24
+#define   NV34TCL_COLOR_MASK_A_MASK							0xff000000
+#define  NV34TCL_STENCIL_BACK_ENABLE							0x00000328
+#define  NV34TCL_STENCIL_BACK_MASK							0x0000032c
+#define  NV34TCL_STENCIL_BACK_FUNC_FUNC							0x00000330
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV34TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV34TCL_STENCIL_BACK_FUNC_REF							0x00000334
+#define  NV34TCL_STENCIL_BACK_FUNC_MASK							0x00000338
+#define  NV34TCL_STENCIL_BACK_OP_FAIL							0x0000033c
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_BACK_OP_ZFAIL							0x00000340
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_BACK_OP_ZPASS							0x00000344
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_FRONT_ENABLE							0x00000348
+#define  NV34TCL_STENCIL_FRONT_MASK							0x0000034c
+#define  NV34TCL_STENCIL_FRONT_FUNC_FUNC						0x00000350
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV34TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV34TCL_STENCIL_FRONT_FUNC_REF							0x00000354
+#define  NV34TCL_STENCIL_FRONT_FUNC_MASK						0x00000358
+#define  NV34TCL_STENCIL_FRONT_OP_FAIL							0x0000035c
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_FRONT_OP_ZFAIL							0x00000360
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV34TCL_STENCIL_FRONT_OP_ZPASS							0x00000364
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV34TCL_SHADE_MODEL								0x00000368
+#define   NV34TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV34TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV34TCL_FOG_ENABLE								0x0000036c
+#define  NV34TCL_FOG_COLOR								0x00000370
+#define  NV34TCL_COLOR_LOGIC_OP_ENABLE							0x00000374
+#define  NV34TCL_COLOR_LOGIC_OP_OP							0x00000378
+#define   NV34TCL_COLOR_LOGIC_OP_OP_CLEAR						0x00001500
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND							0x00001501
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND_REVERSE						0x00001502
+#define   NV34TCL_COLOR_LOGIC_OP_OP_COPY						0x00001503
+#define   NV34TCL_COLOR_LOGIC_OP_OP_AND_INVERTED					0x00001504
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NOOP						0x00001505
+#define   NV34TCL_COLOR_LOGIC_OP_OP_XOR							0x00001506
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR							0x00001507
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NOR							0x00001508
+#define   NV34TCL_COLOR_LOGIC_OP_OP_EQUIV						0x00001509
+#define   NV34TCL_COLOR_LOGIC_OP_OP_INVERT						0x0000150a
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR_REVERSE						0x0000150b
+#define   NV34TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED					0x0000150c
+#define   NV34TCL_COLOR_LOGIC_OP_OP_OR_INVERTED						0x0000150d
+#define   NV34TCL_COLOR_LOGIC_OP_OP_NAND						0x0000150e
+#define   NV34TCL_COLOR_LOGIC_OP_OP_SET							0x0000150f
+#define  NV34TCL_NORMALIZE_ENABLE							0x0000037c
+#define  NV34TCL_COLOR_MATERIAL								0x00000390
+#define   NV34TCL_COLOR_MATERIAL_FRONT_EMISSION_ENABLE					(1 <<  0)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_AMBIENT_ENABLE					(1 <<  2)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_DIFFUSE_ENABLE					(1 <<  4)
+#define   NV34TCL_COLOR_MATERIAL_FRONT_SPECULAR_ENABLE					(1 <<  6)
+#define   NV34TCL_COLOR_MATERIAL_BACK_EMISSION_ENABLE					(1 <<  8)
+#define   NV34TCL_COLOR_MATERIAL_BACK_AMBIENT_ENABLE					(1 << 10)
+#define   NV34TCL_COLOR_MATERIAL_BACK_DIFFUSE_ENABLE					(1 << 12)
+#define   NV34TCL_COLOR_MATERIAL_BACK_SPECULAR_ENABLE					(1 << 14)
+#define  NV34TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV34TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV34TCL_COLOR_MATERIAL_FRONT_R							0x000003a0
+#define  NV34TCL_COLOR_MATERIAL_FRONT_G							0x000003a4
+#define  NV34TCL_COLOR_MATERIAL_FRONT_B							0x000003a8
+#define  NV34TCL_COLOR_MATERIAL_FRONT_A							0x000003b4
+#define  NV34TCL_LINE_WIDTH								0x000003b8
+#define  NV34TCL_LINE_SMOOTH_ENABLE							0x000003bc
+#define  NV34TCL_CLIP_PLANE_ENABLE(x)							(0x00000400+((x)*4))
+#define  NV34TCL_CLIP_PLANE_ENABLE__SIZE						0x00000020
+#define   NV34TCL_CLIP_PLANE_ENABLE_FALSE						0x00000000
+#define   NV34TCL_CLIP_PLANE_ENABLE_EYE_LINEAR						0x00002400
+#define   NV34TCL_CLIP_PLANE_ENABLE_OBJECT_LINEAR					0x00002401
+#define  NV34TCL_MODELVIEW_MATRIX(x)							(0x00000480+((x)*4))
+#define  NV34TCL_MODELVIEW_MATRIX__SIZE							0x00000010
+#define  NV34TCL_INVERSE_MODELVIEW_MATRIX(x)						(0x00000580+((x)*4))
+#define  NV34TCL_INVERSE_MODELVIEW_MATRIX__SIZE						0x0000000c
+#define  NV34TCL_PROJECTION_MATRIX(x)							(0x00000680+((x)*4))
+#define  NV34TCL_PROJECTION_MATRIX__SIZE						0x00000010
+#define  NV34TCL_TX0_MATRIX(x)								(0x000006c0+((x)*4))
+#define  NV34TCL_TX0_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX1_MATRIX(x)								(0x00000700+((x)*4))
+#define  NV34TCL_TX1_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX2_MATRIX(x)								(0x00000740+((x)*4))
+#define  NV34TCL_TX2_MATRIX__SIZE							0x00000010
+#define  NV34TCL_TX3_MATRIX(x)								(0x00000780+((x)*4))
+#define  NV34TCL_TX3_MATRIX__SIZE							0x00000010
+#define  NV34TCL_SCISSOR_HORIZ								0x000008c0
+#define   NV34TCL_SCISSOR_HORIZ_X_SHIFT							0
+#define   NV34TCL_SCISSOR_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_SCISSOR_HORIZ_W_SHIFT							16
+#define   NV34TCL_SCISSOR_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_SCISSOR_VERT								0x000008c4
+#define   NV34TCL_SCISSOR_VERT_Y_SHIFT							0
+#define   NV34TCL_SCISSOR_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_SCISSOR_VERT_H_SHIFT							16
+#define   NV34TCL_SCISSOR_VERT_H_MASK							0xffff0000
+#define  NV34TCL_FOG_COORD_DIST								0x000008c8
+#define  NV34TCL_FOG_MODE								0x000008cc
+#define   NV34TCL_FOG_MODE_EXP								0x00000800
+#define   NV34TCL_FOG_MODE_EXP_2							0x00000802
+#define   NV34TCL_FOG_MODE_EXP2								0x00000803
+#define   NV34TCL_FOG_MODE_LINEAR							0x00000804
+#define   NV34TCL_FOG_MODE_LINEAR_2							0x00002601
+#define  NV34TCL_FOG_EQUATION_CONSTANT							0x000008d0
+#define  NV34TCL_FOG_EQUATION_LINEAR							0x000008d4
+#define  NV34TCL_FOG_EQUATION_QUADRATIC							0x000008d8
+#define  NV34TCL_FP_ACTIVE_PROGRAM							0x000008e4
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA0						(1 <<  0)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_DMA1						(1 <<  1)
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_SHIFT					2
+#define   NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_MASK						0xfffffffc
+#define  NV34TCL_RC_COLOR0								0x000008ec
+#define   NV34TCL_RC_COLOR0_B_SHIFT							0
+#define   NV34TCL_RC_COLOR0_B_MASK							0x000000ff
+#define   NV34TCL_RC_COLOR0_G_SHIFT							8
+#define   NV34TCL_RC_COLOR0_G_MASK							0x0000ff00
+#define   NV34TCL_RC_COLOR0_R_SHIFT							16
+#define   NV34TCL_RC_COLOR0_R_MASK							0x00ff0000
+#define   NV34TCL_RC_COLOR0_A_SHIFT							24
+#define   NV34TCL_RC_COLOR0_A_MASK							0xff000000
+#define  NV34TCL_RC_COLOR1								0x000008f0
+#define   NV34TCL_RC_COLOR1_B_SHIFT							0
+#define   NV34TCL_RC_COLOR1_B_MASK							0x000000ff
+#define   NV34TCL_RC_COLOR1_G_SHIFT							8
+#define   NV34TCL_RC_COLOR1_G_MASK							0x0000ff00
+#define   NV34TCL_RC_COLOR1_R_SHIFT							16
+#define   NV34TCL_RC_COLOR1_R_MASK							0x00ff0000
+#define   NV34TCL_RC_COLOR1_A_SHIFT							24
+#define   NV34TCL_RC_COLOR1_A_MASK							0xff000000
+#define  NV34TCL_RC_FINAL0								0x000008f4
+#define  NV34TCL_RC_FINAL1								0x000008f8
+#define  NV34TCL_RC_ENABLE								0x000008fc
+#define  NV34TCL_RC_IN_ALPHA(x)								(0x00000900+((x)*32))
+#define  NV34TCL_RC_IN_ALPHA__SIZE							0x00000008
+#define  NV34TCL_RC_IN_RGB(x)								(0x00000904+((x)*32))
+#define  NV34TCL_RC_IN_RGB__SIZE							0x00000008
+#define  NV34TCL_RC_CONSTANT_COLOR0(x)							(0x00000908+((x)*32))
+#define  NV34TCL_RC_CONSTANT_COLOR0__SIZE						0x00000008
+#define   NV34TCL_RC_CONSTANT_COLOR0_B_SHIFT						0
+#define   NV34TCL_RC_CONSTANT_COLOR0_B_MASK						0x000000ff
+#define   NV34TCL_RC_CONSTANT_COLOR0_G_SHIFT						8
+#define   NV34TCL_RC_CONSTANT_COLOR0_G_MASK						0x0000ff00
+#define   NV34TCL_RC_CONSTANT_COLOR0_R_SHIFT						16
+#define   NV34TCL_RC_CONSTANT_COLOR0_R_MASK						0x00ff0000
+#define   NV34TCL_RC_CONSTANT_COLOR0_A_SHIFT						24
+#define   NV34TCL_RC_CONSTANT_COLOR0_A_MASK						0xff000000
+#define  NV34TCL_RC_CONSTANT_COLOR1(x)							(0x0000090c+((x)*32))
+#define  NV34TCL_RC_CONSTANT_COLOR1__SIZE						0x00000008
+#define   NV34TCL_RC_CONSTANT_COLOR1_B_SHIFT						0
+#define   NV34TCL_RC_CONSTANT_COLOR1_B_MASK						0x000000ff
+#define   NV34TCL_RC_CONSTANT_COLOR1_G_SHIFT						8
+#define   NV34TCL_RC_CONSTANT_COLOR1_G_MASK						0x0000ff00
+#define   NV34TCL_RC_CONSTANT_COLOR1_R_SHIFT						16
+#define   NV34TCL_RC_CONSTANT_COLOR1_R_MASK						0x00ff0000
+#define   NV34TCL_RC_CONSTANT_COLOR1_A_SHIFT						24
+#define   NV34TCL_RC_CONSTANT_COLOR1_A_MASK						0xff000000
+#define  NV34TCL_RC_OUT_ALPHA(x)							(0x00000910+((x)*32))
+#define  NV34TCL_RC_OUT_ALPHA__SIZE							0x00000008
+#define  NV34TCL_RC_OUT_RGB(x)								(0x00000914+((x)*32))
+#define  NV34TCL_RC_OUT_RGB__SIZE							0x00000008
+#define  NV34TCL_VIEWPORT_HORIZ								0x00000a00
+#define   NV34TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV34TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV34TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NV34TCL_VIEWPORT_VERT								0x00000a04
+#define   NV34TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV34TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NV34TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV34TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x00000a10
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x00000a14
+#define  NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x00000a18
+#define  NV34TCL_VIEWPORT_TRANSLATE_X							0x00000a20
+#define  NV34TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV34TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV34TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV34TCL_VIEWPORT_SCALE_X							0x00000a30
+#define  NV34TCL_VIEWPORT_SCALE_Y							0x00000a34
+#define  NV34TCL_VIEWPORT_SCALE_Z							0x00000a38
+#define  NV34TCL_VIEWPORT_SCALE_W							0x00000a3c
+#define  NV34TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000a60
+#define  NV34TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000a64
+#define  NV34TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000a68
+#define  NV34TCL_DEPTH_FUNC								0x00000a6c
+#define   NV34TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV34TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV34TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV34TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV34TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV34TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV34TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV34TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV34TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV34TCL_DEPTH_WRITE_ENABLE							0x00000a70
+#define  NV34TCL_DEPTH_TEST_ENABLE							0x00000a74
+#define  NV34TCL_POLYGON_OFFSET_FACTOR							0x00000a78
+#define  NV34TCL_POLYGON_OFFSET_UNITS							0x00000a7c
+#define  NV34TCL_VTX_ATTR_3I_XY(x)							(0x00000a80+((x)*8))
+#define  NV34TCL_VTX_ATTR_3I_XY__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_3I_XY_X_SHIFT						0
+#define   NV34TCL_VTX_ATTR_3I_XY_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_3I_XY_Y_SHIFT						16
+#define   NV34TCL_VTX_ATTR_3I_XY_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_3I_Z(x)							(0x00000a84+((x)*8))
+#define  NV34TCL_VTX_ATTR_3I_Z__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_3I_Z_Z_SHIFT							0
+#define   NV34TCL_VTX_ATTR_3I_Z_Z_MASK							0x0000ffff
+#define  NV34TCL_VP_UPLOAD_INST(x)							(0x00000b80+((x)*4))
+#define  NV34TCL_VP_UPLOAD_INST__SIZE							0x00000004
+#define  NV34TCL_CLIP_PLANE_A(x)							(0x00000e00+((x)*16))
+#define  NV34TCL_CLIP_PLANE_A__SIZE							0x00000020
+#define  NV34TCL_CLIP_PLANE_B(x)							(0x00000e04+((x)*16))
+#define  NV34TCL_CLIP_PLANE_B__SIZE							0x00000020
+#define  NV34TCL_CLIP_PLANE_C(x)							(0x00000e08+((x)*16))
+#define  NV34TCL_CLIP_PLANE_C__SIZE							0x00000020
+#define  NV34TCL_CLIP_PLANE_D(x)							(0x00000e0c+((x)*16))
+#define  NV34TCL_CLIP_PLANE_D__SIZE							0x00000020
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x)					(0x00001000+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x)					(0x00001004+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x)					(0x00001008+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x)					(0x0000100c+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x)					(0x00001010+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x)					(0x00001014+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x)					(0x00001018+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x)					(0x0000101c+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE				0x00000008
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x)					(0x00001020+((x)*64))
+#define  NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE				0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_X(x)							(0x00001028+((x)*64))
+#define  NV34TCL_LIGHT_HALF_VECTOR_X__SIZE						0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_Y(x)							(0x0000102c+((x)*64))
+#define  NV34TCL_LIGHT_HALF_VECTOR_Y__SIZE						0x00000008
+#define  NV34TCL_LIGHT_HALF_VECTOR_Z(x)							(0x00001030+((x)*64))
+#define  NV34TCL_LIGHT_HALF_VECTOR_Z__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_X(x)							(0x00001034+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_X__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_Y(x)							(0x00001038+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_Y__SIZE						0x00000008
+#define  NV34TCL_LIGHT_DIRECTION_Z(x)							(0x0000103c+((x)*64))
+#define  NV34TCL_LIGHT_DIRECTION_Z__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_A(x)							(0x00001200+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_A__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_B(x)							(0x00001204+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_B__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_C(x)							(0x00001208+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_C__SIZE						0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_X(x)							(0x0000120c+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_X__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_Y(x)							(0x00001210+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_Y__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_DIR_Z(x)							(0x00001214+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_DIR_Z__SIZE							0x00000008
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_D(x)							(0x00001218+((x)*64))
+#define  NV34TCL_LIGHT_SPOT_CUTOFF_D__SIZE						0x00000008
+#define  NV34TCL_LIGHT_POSITION_X(x)							(0x0000121c+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_X__SIZE							0x00000008
+#define  NV34TCL_LIGHT_POSITION_Y(x)							(0x00001220+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_Y__SIZE							0x00000008
+#define  NV34TCL_LIGHT_POSITION_Z(x)							(0x00001224+((x)*64))
+#define  NV34TCL_LIGHT_POSITION_Z__SIZE							0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_CONSTANT(x)						(0x00001228+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_CONSTANT__SIZE					0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_LINEAR(x)						(0x0000122c+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_LINEAR__SIZE						0x00000008
+#define  NV34TCL_LIGHT_ATTENUATION_QUADRATIC(x)						(0x00001230+((x)*64))
+#define  NV34TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE					0x00000008
+#define  NV34TCL_FRONT_MATERIAL_SHININESS(x)						(0x00001400+((x)*4))
+#define  NV34TCL_FRONT_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV34TCL_ENABLED_LIGHTS								0x00001420
+#define  NV34TCL_FP_REG_CONTROL								0x00001450
+#define   NV34TCL_FP_REG_CONTROL_UNK1_SHIFT						16
+#define   NV34TCL_FP_REG_CONTROL_UNK1_MASK						0xffff0000
+#define   NV34TCL_FP_REG_CONTROL_UNK0_SHIFT						0
+#define   NV34TCL_FP_REG_CONTROL_UNK0_MASK						0x0000ffff
+#define  NV34TCL_VP_CLIP_PLANES_ENABLE							0x00001478
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0						(1 <<  1)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1						(1 <<  5)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2						(1 <<  9)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3						(1 << 13)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4						(1 << 17)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5						(1 << 21)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE6						(1 << 25)
+#define   NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE7						(1 << 29)
+#define  NV34TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV34TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4))
+#define  NV34TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV34TCL_VTX_ATTR_3F_X(x)							(0x00001500+((x)*16))
+#define  NV34TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_3F_Y(x)							(0x00001504+((x)*16))
+#define  NV34TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_3F_Z(x)							(0x00001508+((x)*16))
+#define  NV34TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV34TCL_VP_CLIP_PLANE_A(x)							(0x00001600+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_A__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_B(x)							(0x00001604+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_B__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_C(x)							(0x00001608+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_C__SIZE							0x00000006
+#define  NV34TCL_VP_CLIP_PLANE_D(x)							(0x0000160c+((x)*16))
+#define  NV34TCL_VP_CLIP_PLANE_D__SIZE							0x00000006
+#define  NV34TCL_VTXBUF_ADDRESS(x)							(0x00001680+((x)*4))
+#define  NV34TCL_VTXBUF_ADDRESS__SIZE							0x00000010
+#define   NV34TCL_VTXBUF_ADDRESS_DMA1							(1 << 31)
+#define   NV34TCL_VTXBUF_ADDRESS_OFFSET_SHIFT						0
+#define   NV34TCL_VTXBUF_ADDRESS_OFFSET_MASK						0x0fffffff
+#define  NV34TCL_VTXFMT(x)								(0x00001740+((x)*4))
+#define  NV34TCL_VTXFMT__SIZE								0x00000010
+#define   NV34TCL_VTXFMT_TYPE_SHIFT							0
+#define   NV34TCL_VTXFMT_TYPE_MASK							0x0000000f
+#define    NV34TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV34TCL_VTXFMT_TYPE_UBYTE							0x00000004
+#define    NV34TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define   NV34TCL_VTXFMT_SIZE_SHIFT							4
+#define   NV34TCL_VTXFMT_SIZE_MASK							0x000000f0
+#define   NV34TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV34TCL_VTXFMT_STRIDE_MASK							0x0000ff00
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R			0x000017a0
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G			0x000017a4
+#define  NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B			0x000017a8
+#define  NV34TCL_COLOR_MATERIAL_BACK_R							0x000017b0
+#define  NV34TCL_COLOR_MATERIAL_BACK_G							0x000017b4
+#define  NV34TCL_COLOR_MATERIAL_BACK_B							0x000017b8
+#define  NV34TCL_COLOR_MATERIAL_BACK_A							0x000017c0
+#define  NV34TCL_QUERY_RESET								0x000017c8
+#define  NV34TCL_QUERY_UNK17CC								0x000017cc
+#define  NV34TCL_QUERY_GET								0x00001800
+#define   NV34TCL_QUERY_GET_UNK24_SHIFT							24
+#define   NV34TCL_QUERY_GET_UNK24_MASK							0xff000000
+#define   NV34TCL_QUERY_GET_OFFSET_SHIFT						0
+#define   NV34TCL_QUERY_GET_OFFSET_MASK							0x00ffffff
+#define  NV34TCL_VERTEX_BEGIN_END							0x00001808
+#define   NV34TCL_VERTEX_BEGIN_END_STOP							0x00000000
+#define   NV34TCL_VERTEX_BEGIN_END_POINTS						0x00000001
+#define   NV34TCL_VERTEX_BEGIN_END_LINES						0x00000002
+#define   NV34TCL_VERTEX_BEGIN_END_LINE_LOOP						0x00000003
+#define   NV34TCL_VERTEX_BEGIN_END_LINE_STRIP						0x00000004
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLES						0x00000005
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP					0x00000006
+#define   NV34TCL_VERTEX_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV34TCL_VERTEX_BEGIN_END_QUADS						0x00000008
+#define   NV34TCL_VERTEX_BEGIN_END_QUAD_STRIP						0x00000009
+#define   NV34TCL_VERTEX_BEGIN_END_POLYGON						0x0000000a
+#define  NV34TCL_VB_ELEMENT_U16								0x0000180c
+#define   NV34TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NV34TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NV34TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NV34TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV34TCL_VB_ELEMENT_U32								0x00001810
+#define  NV34TCL_VB_VERTEX_BATCH							0x00001814
+#define   NV34TCL_VB_VERTEX_BATCH_OFFSET_SHIFT						0
+#define   NV34TCL_VB_VERTEX_BATCH_OFFSET_MASK						0x00ffffff
+#define   NV34TCL_VB_VERTEX_BATCH_COUNT_SHIFT						24
+#define   NV34TCL_VB_VERTEX_BATCH_COUNT_MASK						0xff000000
+#define  NV34TCL_VERTEX_DATA								0x00001818
+#define  NV34TCL_IDXBUF_ADDRESS								0x0000181c
+#define  NV34TCL_IDXBUF_FORMAT								0x00001820
+#define   NV34TCL_IDXBUF_FORMAT_TYPE_SHIFT						4
+#define   NV34TCL_IDXBUF_FORMAT_TYPE_MASK						0x000000f0
+#define    NV34TCL_IDXBUF_FORMAT_TYPE_U32						0x00000000
+#define    NV34TCL_IDXBUF_FORMAT_TYPE_U16						0x00000010
+#define   NV34TCL_IDXBUF_FORMAT_DMA1							(1 <<  0)
+#define  NV34TCL_VB_INDEX_BATCH								0x00001824
+#define   NV34TCL_VB_INDEX_BATCH_COUNT_SHIFT						24
+#define   NV34TCL_VB_INDEX_BATCH_COUNT_MASK						0xff000000
+#define   NV34TCL_VB_INDEX_BATCH_START_SHIFT						0
+#define   NV34TCL_VB_INDEX_BATCH_START_MASK						0x00ffffff
+#define  NV34TCL_POLYGON_MODE_FRONT							0x00001828
+#define   NV34TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV34TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV34TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV34TCL_POLYGON_MODE_BACK							0x0000182c
+#define   NV34TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV34TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV34TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV34TCL_CULL_FACE								0x00001830
+#define   NV34TCL_CULL_FACE_FRONT							0x00000404
+#define   NV34TCL_CULL_FACE_BACK							0x00000405
+#define   NV34TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV34TCL_FRONT_FACE								0x00001834
+#define   NV34TCL_FRONT_FACE_CW								0x00000900
+#define   NV34TCL_FRONT_FACE_CCW							0x00000901
+#define  NV34TCL_POLYGON_SMOOTH_ENABLE							0x00001838
+#define  NV34TCL_CULL_FACE_ENABLE							0x0000183c
+#define  NV34TCL_TX_PALETTE_OFFSET(x)							(0x00001840+((x)*4))
+#define  NV34TCL_TX_PALETTE_OFFSET__SIZE						0x00000004
+#define  NV34TCL_VTX_ATTR_2F_X(x)							(0x00001880+((x)*8))
+#define  NV34TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_2F_Y(x)							(0x00001884+((x)*8))
+#define  NV34TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_2I(x)								(0x00001900+((x)*4))
+#define  NV34TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV34TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV34TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_4UB(x)							(0x00001940+((x)*4))
+#define  NV34TCL_VTX_ATTR_4UB__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4UB_X_SHIFT							0
+#define   NV34TCL_VTX_ATTR_4UB_X_MASK							0x000000ff
+#define   NV34TCL_VTX_ATTR_4UB_Y_SHIFT							8
+#define   NV34TCL_VTX_ATTR_4UB_Y_MASK							0x0000ff00
+#define   NV34TCL_VTX_ATTR_4UB_Z_SHIFT							16
+#define   NV34TCL_VTX_ATTR_4UB_Z_MASK							0x00ff0000
+#define   NV34TCL_VTX_ATTR_4UB_W_SHIFT							24
+#define   NV34TCL_VTX_ATTR_4UB_W_MASK							0xff000000
+#define  NV34TCL_VTX_ATTR_4I_XY(x)							(0x00001980+((x)*8))
+#define  NV34TCL_VTX_ATTR_4I_XY__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4I_XY_X_SHIFT						0
+#define   NV34TCL_VTX_ATTR_4I_XY_X_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_4I_XY_Y_SHIFT						16
+#define   NV34TCL_VTX_ATTR_4I_XY_Y_MASK							0xffff0000
+#define  NV34TCL_VTX_ATTR_4I_ZW(x)							(0x00001984+((x)*8))
+#define  NV34TCL_VTX_ATTR_4I_ZW__SIZE							0x00000010
+#define   NV34TCL_VTX_ATTR_4I_ZW_Z_SHIFT						0
+#define   NV34TCL_VTX_ATTR_4I_ZW_Z_MASK							0x0000ffff
+#define   NV34TCL_VTX_ATTR_4I_ZW_W_SHIFT						16
+#define   NV34TCL_VTX_ATTR_4I_ZW_W_MASK							0xffff0000
+#define  NV34TCL_TX_OFFSET(x)								(0x00001a00+((x)*32))
+#define  NV34TCL_TX_OFFSET__SIZE							0x00000004
+#define  NV34TCL_TX_FORMAT(x)								(0x00001a04+((x)*32))
+#define  NV34TCL_TX_FORMAT__SIZE							0x00000004
+#define   NV34TCL_TX_FORMAT_DMA0							(1 <<  0)
+#define   NV34TCL_TX_FORMAT_DMA1							(1 <<  1)
+#define   NV34TCL_TX_FORMAT_CUBIC							(1 <<  2)
+#define   NV34TCL_TX_FORMAT_NO_BORDER							(1 <<  3)
+#define   NV34TCL_TX_FORMAT_DIMS_SHIFT							4
+#define   NV34TCL_TX_FORMAT_DIMS_MASK							0x000000f0
+#define    NV34TCL_TX_FORMAT_DIMS_1D							0x00000010
+#define    NV34TCL_TX_FORMAT_DIMS_2D							0x00000020
+#define    NV34TCL_TX_FORMAT_DIMS_3D							0x00000030
+#define   NV34TCL_TX_FORMAT_FORMAT_SHIFT						8
+#define   NV34TCL_TX_FORMAT_FORMAT_MASK							0x0000ff00
+#define    NV34TCL_TX_FORMAT_FORMAT_L8							0x00000000
+#define    NV34TCL_TX_FORMAT_FORMAT_A8							0x00000100
+#define    NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5						0x00000200
+#define    NV34TCL_TX_FORMAT_FORMAT_A8_RECT						0x00000300
+#define    NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4						0x00000400
+#define    NV34TCL_TX_FORMAT_FORMAT_R5G6B5						0x00000500
+#define    NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8						0x00000600
+#define    NV34TCL_TX_FORMAT_FORMAT_X8R8G8B8						0x00000700
+#define    NV34TCL_TX_FORMAT_FORMAT_INDEX8						0x00000b00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT1						0x00000c00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT3						0x00000e00
+#define    NV34TCL_TX_FORMAT_FORMAT_DXT5						0x00000f00
+#define    NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT					0x00001000
+#define    NV34TCL_TX_FORMAT_FORMAT_R5G6B5_RECT						0x00001100
+#define    NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT					0x00001200
+#define    NV34TCL_TX_FORMAT_FORMAT_L8_RECT						0x00001300
+#define    NV34TCL_TX_FORMAT_FORMAT_A8L8						0x00001a00
+#define    NV34TCL_TX_FORMAT_FORMAT_A8_RECT2						0x00001b00
+#define    NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT					0x00001d00
+#define    NV34TCL_TX_FORMAT_FORMAT_R8G8B8_RECT						0x00001e00
+#define    NV34TCL_TX_FORMAT_FORMAT_L8A8_RECT						0x00002000
+#define    NV34TCL_TX_FORMAT_FORMAT_A16							0x00003200
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO16						0x00003300
+#define    NV34TCL_TX_FORMAT_FORMAT_A16_RECT						0x00003500
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO16_RECT						0x00003600
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO8						0x00004400
+#define    NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8					0x00004500
+#define    NV34TCL_TX_FORMAT_FORMAT_HILO8_RECT						0x00004600
+#define    NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT					0x00004700
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV					0x00004a00
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV					0x00004b00
+#define    NV34TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV					0x00004c00
+#define   NV34TCL_TX_FORMAT_MIPMAP							(1 << 19)
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT						20
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_U_MASK						0x00f00000
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT						24
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_V_MASK						0x0f000000
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT						28
+#define   NV34TCL_TX_FORMAT_BASE_SIZE_W_MASK						0xf0000000
+#define  NV34TCL_TX_WRAP(x)								(0x00001a08+((x)*32))
+#define  NV34TCL_TX_WRAP__SIZE								0x00000004
+#define   NV34TCL_TX_WRAP_S_SHIFT							0
+#define   NV34TCL_TX_WRAP_S_MASK							0x000000ff
+#define    NV34TCL_TX_WRAP_S_REPEAT							0x00000001
+#define    NV34TCL_TX_WRAP_S_MIRRORED_REPEAT						0x00000002
+#define    NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE						0x00000003
+#define    NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER						0x00000004
+#define    NV34TCL_TX_WRAP_S_CLAMP							0x00000005
+#define   NV34TCL_TX_WRAP_T_SHIFT							8
+#define   NV34TCL_TX_WRAP_T_MASK							0x00000f00
+#define    NV34TCL_TX_WRAP_T_REPEAT							0x00000100
+#define    NV34TCL_TX_WRAP_T_MIRRORED_REPEAT						0x00000200
+#define    NV34TCL_TX_WRAP_T_CLAMP_TO_EDGE						0x00000300
+#define    NV34TCL_TX_WRAP_T_CLAMP_TO_BORDER						0x00000400
+#define    NV34TCL_TX_WRAP_T_CLAMP							0x00000500
+#define   NV34TCL_TX_WRAP_EXPAND_NORMAL_SHIFT						12
+#define   NV34TCL_TX_WRAP_EXPAND_NORMAL_MASK						0x0000f000
+#define   NV34TCL_TX_WRAP_R_SHIFT							16
+#define   NV34TCL_TX_WRAP_R_MASK							0x00ff0000
+#define    NV34TCL_TX_WRAP_R_REPEAT							0x00010000
+#define    NV34TCL_TX_WRAP_R_MIRRORED_REPEAT						0x00020000
+#define    NV34TCL_TX_WRAP_R_CLAMP_TO_EDGE						0x00030000
+#define    NV34TCL_TX_WRAP_R_CLAMP_TO_BORDER						0x00040000
+#define    NV34TCL_TX_WRAP_R_CLAMP							0x00050000
+#define   NV34TCL_TX_WRAP_RCOMP_SHIFT							28
+#define   NV34TCL_TX_WRAP_RCOMP_MASK							0xf0000000
+#define    NV34TCL_TX_WRAP_RCOMP_NEVER							0x00000000
+#define    NV34TCL_TX_WRAP_RCOMP_GREATER						0x10000000
+#define    NV34TCL_TX_WRAP_RCOMP_EQUAL							0x20000000
+#define    NV34TCL_TX_WRAP_RCOMP_GEQUAL							0x30000000
+#define    NV34TCL_TX_WRAP_RCOMP_LESS							0x40000000
+#define    NV34TCL_TX_WRAP_RCOMP_NOTEQUAL						0x50000000
+#define    NV34TCL_TX_WRAP_RCOMP_LEQUAL							0x60000000
+#define    NV34TCL_TX_WRAP_RCOMP_ALWAYS							0x70000000
+#define  NV34TCL_TX_ENABLE(x)								(0x00001a0c+((x)*32))
+#define  NV34TCL_TX_ENABLE__SIZE							0x00000004
+#define   NV34TCL_TX_ENABLE_ANISO_SHIFT							4
+#define   NV34TCL_TX_ENABLE_ANISO_MASK							0x00000030
+#define    NV34TCL_TX_ENABLE_ANISO_NONE							0x00000000
+#define    NV34TCL_TX_ENABLE_ANISO_2X							0x00000010
+#define    NV34TCL_TX_ENABLE_ANISO_4X							0x00000020
+#define    NV34TCL_TX_ENABLE_ANISO_8X							0x00000030
+#define   NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT					14
+#define   NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK						0x0003c000
+#define   NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT					26
+#define   NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK						0x3c000000
+#define   NV34TCL_TX_ENABLE_ENABLE							(1 << 30)
+#define  NV34TCL_TX_SWIZZLE(x)								(0x00001a10+((x)*32))
+#define  NV34TCL_TX_SWIZZLE__SIZE							0x00000004
+#define   NV34TCL_TX_SWIZZLE_S0_X_SHIFT							14
+#define   NV34TCL_TX_SWIZZLE_S0_X_MASK							0x0000c000
+#define    NV34TCL_TX_SWIZZLE_S0_X_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_X_ONE							0x00004000
+#define    NV34TCL_TX_SWIZZLE_S0_X_S1							0x00008000
+#define   NV34TCL_TX_SWIZZLE_S0_Y_SHIFT							12
+#define   NV34TCL_TX_SWIZZLE_S0_Y_MASK							0x00003000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_ONE							0x00001000
+#define    NV34TCL_TX_SWIZZLE_S0_Y_S1							0x00002000
+#define   NV34TCL_TX_SWIZZLE_S0_Z_SHIFT							10
+#define   NV34TCL_TX_SWIZZLE_S0_Z_MASK							0x00000c00
+#define    NV34TCL_TX_SWIZZLE_S0_Z_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_Z_ONE							0x00000400
+#define    NV34TCL_TX_SWIZZLE_S0_Z_S1							0x00000800
+#define   NV34TCL_TX_SWIZZLE_S0_W_SHIFT							8
+#define   NV34TCL_TX_SWIZZLE_S0_W_MASK							0x00000300
+#define    NV34TCL_TX_SWIZZLE_S0_W_ZERO							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S0_W_ONE							0x00000100
+#define    NV34TCL_TX_SWIZZLE_S0_W_S1							0x00000200
+#define   NV34TCL_TX_SWIZZLE_S1_X_SHIFT							6
+#define   NV34TCL_TX_SWIZZLE_S1_X_MASK							0x000000c0
+#define    NV34TCL_TX_SWIZZLE_S1_X_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_X_Z							0x00000040
+#define    NV34TCL_TX_SWIZZLE_S1_X_Y							0x00000080
+#define    NV34TCL_TX_SWIZZLE_S1_X_X							0x000000c0
+#define   NV34TCL_TX_SWIZZLE_S1_Y_SHIFT							4
+#define   NV34TCL_TX_SWIZZLE_S1_Y_MASK							0x00000030
+#define    NV34TCL_TX_SWIZZLE_S1_Y_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_Y_Z							0x00000010
+#define    NV34TCL_TX_SWIZZLE_S1_Y_Y							0x00000020
+#define    NV34TCL_TX_SWIZZLE_S1_Y_X							0x00000030
+#define   NV34TCL_TX_SWIZZLE_S1_Z_SHIFT							2
+#define   NV34TCL_TX_SWIZZLE_S1_Z_MASK							0x0000000c
+#define    NV34TCL_TX_SWIZZLE_S1_Z_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_Z_Z							0x00000004
+#define    NV34TCL_TX_SWIZZLE_S1_Z_Y							0x00000008
+#define    NV34TCL_TX_SWIZZLE_S1_Z_X							0x0000000c
+#define   NV34TCL_TX_SWIZZLE_S1_W_SHIFT							0
+#define   NV34TCL_TX_SWIZZLE_S1_W_MASK							0x00000003
+#define    NV34TCL_TX_SWIZZLE_S1_W_W							0x00000000
+#define    NV34TCL_TX_SWIZZLE_S1_W_Z							0x00000001
+#define    NV34TCL_TX_SWIZZLE_S1_W_Y							0x00000002
+#define    NV34TCL_TX_SWIZZLE_S1_W_X							0x00000003
+#define   NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT						16
+#define   NV34TCL_TX_SWIZZLE_RECT_PITCH_MASK						0xffff0000
+#define  NV34TCL_TX_FILTER(x)								(0x00001a14+((x)*32))
+#define  NV34TCL_TX_FILTER__SIZE							0x00000004
+#define   NV34TCL_TX_FILTER_LOD_BIAS_SHIFT						8
+#define   NV34TCL_TX_FILTER_LOD_BIAS_MASK						0x00000f00
+#define   NV34TCL_TX_FILTER_MINIFY_SHIFT						16
+#define   NV34TCL_TX_FILTER_MINIFY_MASK							0x000f0000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST						0x00010000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR						0x00020000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST				0x00030000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST				0x00040000
+#define    NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR				0x00050000
+#define    NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR				0x00060000
+#define   NV34TCL_TX_FILTER_MAGNIFY_SHIFT						24
+#define   NV34TCL_TX_FILTER_MAGNIFY_MASK						0x0f000000
+#define    NV34TCL_TX_FILTER_MAGNIFY_NEAREST						0x01000000
+#define    NV34TCL_TX_FILTER_MAGNIFY_LINEAR						0x02000000
+#define   NV34TCL_TX_FILTER_SIGNED_BLUE							(1 << 28)
+#define   NV34TCL_TX_FILTER_SIGNED_GREEN						(1 << 29)
+#define   NV34TCL_TX_FILTER_SIGNED_RED							(1 << 30)
+#define   NV34TCL_TX_FILTER_SIGNED_ALPHA						(1 << 31)
+#define  NV34TCL_TX_NPOT_SIZE(x)							(0x00001a18+((x)*32))
+#define  NV34TCL_TX_NPOT_SIZE__SIZE							0x00000004
+#define   NV34TCL_TX_NPOT_SIZE_H_SHIFT							0
+#define   NV34TCL_TX_NPOT_SIZE_H_MASK							0x0000ffff
+#define   NV34TCL_TX_NPOT_SIZE_W_SHIFT							16
+#define   NV34TCL_TX_NPOT_SIZE_W_MASK							0xffff0000
+#define  NV34TCL_TX_BORDER_COLOR(x)							(0x00001a1c+((x)*32))
+#define  NV34TCL_TX_BORDER_COLOR__SIZE							0x00000004
+#define   NV34TCL_TX_BORDER_COLOR_B_SHIFT						0
+#define   NV34TCL_TX_BORDER_COLOR_B_MASK						0x000000ff
+#define   NV34TCL_TX_BORDER_COLOR_G_SHIFT						8
+#define   NV34TCL_TX_BORDER_COLOR_G_MASK						0x0000ff00
+#define   NV34TCL_TX_BORDER_COLOR_R_SHIFT						16
+#define   NV34TCL_TX_BORDER_COLOR_R_MASK						0x00ff0000
+#define   NV34TCL_TX_BORDER_COLOR_A_SHIFT						24
+#define   NV34TCL_TX_BORDER_COLOR_A_MASK						0xff000000
+#define  NV34TCL_VTX_ATTR_4F_X(x)							(0x00001c00+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_X__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_4F_Y(x)							(0x00001c04+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_4F_Z(x)							(0x00001c08+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV34TCL_VTX_ATTR_4F_W(x)							(0x00001c0c+((x)*16))
+#define  NV34TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV34TCL_FP_CONTROL								0x00001d60
+#define   NV34TCL_FP_CONTROL_USES_KIL							(1 <<  7)
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_SHIFT				0
+#define   NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_MASK					0x0000000f
+#define  NV34TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define   NV34TCL_MULTISAMPLE_CONTROL_ENABLE						(1 <<  0)
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_COVERAGE				(1 <<  4)
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_ONE				(1 <<  8)
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_SHIFT				16
+#define   NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_MASK				0xffff0000
+#define  NV34TCL_CLEAR_DEPTH_VALUE							0x00001d8c
+#define  NV34TCL_CLEAR_COLOR_VALUE							0x00001d90
+#define   NV34TCL_CLEAR_COLOR_VALUE_B_SHIFT						0
+#define   NV34TCL_CLEAR_COLOR_VALUE_B_MASK						0x000000ff
+#define   NV34TCL_CLEAR_COLOR_VALUE_G_SHIFT						8
+#define   NV34TCL_CLEAR_COLOR_VALUE_G_MASK						0x0000ff00
+#define   NV34TCL_CLEAR_COLOR_VALUE_R_SHIFT						16
+#define   NV34TCL_CLEAR_COLOR_VALUE_R_MASK						0x00ff0000
+#define   NV34TCL_CLEAR_COLOR_VALUE_A_SHIFT						24
+#define   NV34TCL_CLEAR_COLOR_VALUE_A_MASK						0xff000000
+#define  NV34TCL_CLEAR_BUFFERS								0x00001d94
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_A							(1 <<  7)
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_B							(1 <<  6)
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_G							(1 <<  5)
+#define   NV34TCL_CLEAR_BUFFERS_COLOR_R							(1 <<  4)
+#define   NV34TCL_CLEAR_BUFFERS_STENCIL							(1 <<  1)
+#define   NV34TCL_CLEAR_BUFFERS_DEPTH							(1 <<  0)
+#define  NV34TCL_DO_VERTICES								0x00001dac
+#define  NV34TCL_LINE_STIPPLE_ENABLE							0x00001db4
+#define  NV34TCL_LINE_STIPPLE_PATTERN							0x00001db8
+#define   NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT					0
+#define   NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK					0x0000ffff
+#define   NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT					16
+#define   NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK					0xffff0000
+#define  NV34TCL_BACK_MATERIAL_SHININESS(x)						(0x00001e20+((x)*4))
+#define  NV34TCL_BACK_MATERIAL_SHININESS__SIZE						0x00000006
+#define  NV34TCL_VTX_ATTR_1F(x)								(0x00001e40+((x)*4))
+#define  NV34TCL_VTX_ATTR_1F__SIZE							0x00000010
+#define  NV34TCL_VP_UPLOAD_FROM_ID							0x00001e9c
+#define  NV34TCL_VP_START_FROM_ID							0x00001ea0
+#define  NV34TCL_POINT_PARAMETERS(x)							(0x00001ec0+((x)*4))
+#define  NV34TCL_POINT_PARAMETERS__SIZE							0x00000008
+#define  NV34TCL_POINT_SIZE								0x00001ee0
+#define  NV34TCL_POINT_PARAMETERS_ENABLE						0x00001ee4
+#define  NV34TCL_POINT_SPRITE								0x00001ee8
+#define   NV34TCL_POINT_SPRITE_ENABLE							(1 <<  0)
+#define   NV34TCL_POINT_SPRITE_R_MODE_SHIFT						1
+#define   NV34TCL_POINT_SPRITE_R_MODE_MASK						0x00000006
+#define    NV34TCL_POINT_SPRITE_R_MODE_GL_ZERO						0x00000000
+#define    NV34TCL_POINT_SPRITE_R_MODE_GL_R						0x00000002
+#define    NV34TCL_POINT_SPRITE_R_MODE_GL_S						0x00000004
+#define   NV34TCL_POINT_SPRITE_COORD_REPLACE						(1 << 11)
+#define  NV34TCL_VP_UPLOAD_CONST_ID							0x00001efc
+#define  NV34TCL_VP_UPLOAD_CONST_X(x)							(0x00001f00+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_X__SIZE						0x00000004
+#define  NV34TCL_VP_UPLOAD_CONST_Y(x)							(0x00001f04+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_Y__SIZE						0x00000004
+#define  NV34TCL_VP_UPLOAD_CONST_Z(x)							(0x00001f08+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_Z__SIZE						0x00000004
+#define  NV34TCL_VP_UPLOAD_CONST_W(x)							(0x00001f0c+((x)*16))
+#define  NV34TCL_VP_UPLOAD_CONST_W__SIZE						0x00000004
+#define  NV34TCL_UNK1f80(x)								(0x00001f80+((x)*4))
+#define  NV34TCL_UNK1f80__SIZE								0x00000010
+
+
+#define NV40_CONTEXT_SURFACES_2D							0x00003062
+
+
+
+#define NV40_STRETCHED_IMAGE_FROM_CPU							0x00003066
+
+
+
+#define NV40_TEXTURE_FROM_CPU								0x0000307b
+
+
+
+#define NV40_SCALED_IMAGE_FROM_MEMORY							0x00003089
+
+
+
+#define NV40_IMAGE_FROM_CPU								0x0000308a
+
+
+
+#define NV40_SWIZZLED_SURFACE								0x0000309e
+
+
+
+#define NV40TCL										0x00004097
+
+#define  NV40TCL_REF_CNT								0x00000050
+#define  NV40TCL_NOP									0x00000100
+#define  NV40TCL_NOTIFY									0x00000104
+#define  NV40TCL_DMA_NOTIFY								0x00000180
+#define  NV40TCL_DMA_TEXTURE0								0x00000184
+#define  NV40TCL_DMA_TEXTURE1								0x00000188
+#define  NV40TCL_DMA_COLOR1								0x0000018c
+#define  NV40TCL_DMA_COLOR0								0x00000194
+#define  NV40TCL_DMA_ZETA								0x00000198
+#define  NV40TCL_DMA_VTXBUF0								0x0000019c
+#define  NV40TCL_DMA_VTXBUF1								0x000001a0
+#define  NV40TCL_DMA_FENCE								0x000001a4
+#define  NV40TCL_DMA_QUERY								0x000001a8
+#define  NV40TCL_DMA_UNK01AC								0x000001ac
+#define  NV40TCL_DMA_UNK01B0								0x000001b0
+#define  NV40TCL_DMA_COLOR2								0x000001b4
+#define  NV40TCL_DMA_COLOR3								0x000001b8
+#define  NV40TCL_RT_HORIZ								0x00000200
+#define   NV40TCL_RT_HORIZ_W_SHIFT							16
+#define   NV40TCL_RT_HORIZ_W_MASK							0xffff0000
+#define   NV40TCL_RT_HORIZ_X_SHIFT							0
+#define   NV40TCL_RT_HORIZ_X_MASK							0x0000ffff
+#define  NV40TCL_RT_VERT								0x00000204
+#define   NV40TCL_RT_VERT_H_SHIFT							16
+#define   NV40TCL_RT_VERT_H_MASK							0xffff0000
+#define   NV40TCL_RT_VERT_Y_SHIFT							0
+#define   NV40TCL_RT_VERT_Y_MASK							0x0000ffff
+#define  NV40TCL_RT_FORMAT								0x00000208
+#define   NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT						24
+#define   NV40TCL_RT_FORMAT_LOG2_HEIGHT_MASK						0xff000000
+#define   NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT						16
+#define   NV40TCL_RT_FORMAT_LOG2_WIDTH_MASK						0x00ff0000
+#define   NV40TCL_RT_FORMAT_TYPE_SHIFT							8
+#define   NV40TCL_RT_FORMAT_TYPE_MASK							0x00000f00
+#define    NV40TCL_RT_FORMAT_TYPE_LINEAR						0x00000100
+#define    NV40TCL_RT_FORMAT_TYPE_SWIZZLED						0x00000200
+#define   NV40TCL_RT_FORMAT_ZETA_SHIFT							5
+#define   NV40TCL_RT_FORMAT_ZETA_MASK							0x000000e0
+#define    NV40TCL_RT_FORMAT_ZETA_Z16							0x00000020
+#define    NV40TCL_RT_FORMAT_ZETA_Z24S8							0x00000040
+#define   NV40TCL_RT_FORMAT_COLOR_SHIFT							0
+#define   NV40TCL_RT_FORMAT_COLOR_MASK							0x0000001f
+#define    NV40TCL_RT_FORMAT_COLOR_R5G6B5						0x00000003
+#define    NV40TCL_RT_FORMAT_COLOR_X8R8G8B8						0x00000005
+#define    NV40TCL_RT_FORMAT_COLOR_A8R8G8B8						0x00000008
+#define    NV40TCL_RT_FORMAT_COLOR_B8							0x00000009
+#define    NV40TCL_RT_FORMAT_COLOR_X8B8G8R8						0x0000000f
+#define    NV40TCL_RT_FORMAT_COLOR_A8B8G8R8						0x00000010
+#define  NV40TCL_COLOR0_PITCH								0x0000020c
+#define  NV40TCL_COLOR0_OFFSET								0x00000210
+#define  NV40TCL_ZETA_OFFSET								0x00000214
+#define  NV40TCL_COLOR1_OFFSET								0x00000218
+#define  NV40TCL_COLOR1_PITCH								0x0000021c
+#define  NV40TCL_RT_ENABLE								0x00000220
+#define   NV40TCL_RT_ENABLE_MRT								(1 <<  4)
+#define   NV40TCL_RT_ENABLE_COLOR3							(1 <<  3)
+#define   NV40TCL_RT_ENABLE_COLOR2							(1 <<  2)
+#define   NV40TCL_RT_ENABLE_COLOR1							(1 <<  1)
+#define   NV40TCL_RT_ENABLE_COLOR0							(1 <<  0)
+#define  NV40TCL_ZETA_PITCH								0x0000022c
+#define  NV40TCL_COLOR2_PITCH								0x00000280
+#define  NV40TCL_COLOR3_PITCH								0x00000284
+#define  NV40TCL_COLOR2_OFFSET								0x00000288
+#define  NV40TCL_COLOR3_OFFSET								0x0000028c
+#define  NV40TCL_VIEWPORT_CLIP_HORIZ(x)							(0x000002c0+((x)*8))
+#define  NV40TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define  NV40TCL_VIEWPORT_CLIP_VERT(x)							(0x000002c4+((x)*8))
+#define  NV40TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV40TCL_DITHER_ENABLE								0x00000300
+#define  NV40TCL_ALPHA_TEST_ENABLE							0x00000304
+#define  NV40TCL_ALPHA_TEST_FUNC							0x00000308
+#define   NV40TCL_ALPHA_TEST_FUNC_NEVER							0x00000200
+#define   NV40TCL_ALPHA_TEST_FUNC_LESS							0x00000201
+#define   NV40TCL_ALPHA_TEST_FUNC_EQUAL							0x00000202
+#define   NV40TCL_ALPHA_TEST_FUNC_LEQUAL						0x00000203
+#define   NV40TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV40TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV40TCL_ALPHA_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV40TCL_ALPHA_TEST_FUNC_GEQUAL						0x00000206
+#define   NV40TCL_ALPHA_TEST_FUNC_ALWAYS						0x00000207
+#define  NV40TCL_ALPHA_TEST_REF								0x0000030c
+#define  NV40TCL_BLEND_ENABLE								0x00000310
+#define  NV40TCL_BLEND_FUNC_SRC								0x00000314
+#define   NV40TCL_BLEND_FUNC_SRC_RGB_SHIFT						0
+#define   NV40TCL_BLEND_FUNC_SRC_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV40TCL_BLEND_FUNC_SRC_ALPHA_SHIFT						16
+#define   NV40TCL_BLEND_FUNC_SRC_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x03000000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x03020000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x03040000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x03060000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV40TCL_BLEND_FUNC_DST								0x00000318
+#define   NV40TCL_BLEND_FUNC_DST_RGB_SHIFT						0
+#define   NV40TCL_BLEND_FUNC_DST_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define    NV40TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define    NV40TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define    NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE				0x00000308
+#define    NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define    NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define    NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV40TCL_BLEND_FUNC_DST_ALPHA_SHIFT						16
+#define   NV40TCL_BLEND_FUNC_DST_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x03000000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x03010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x03020000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x03030000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x03040000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x03050000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x03060000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x03070000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x03080000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x80010000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x80020000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x80030000
+#define    NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x80040000
+#define  NV40TCL_BLEND_COLOR								0x0000031c
+#define  NV40TCL_BLEND_EQUATION								0x00000320
+#define   NV40TCL_BLEND_EQUATION_RGB_SHIFT						0
+#define   NV40TCL_BLEND_EQUATION_RGB_MASK						0x0000ffff
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_ADD						0x00008006
+#define    NV40TCL_BLEND_EQUATION_RGB_MIN						0x00008007
+#define    NV40TCL_BLEND_EQUATION_RGB_MAX						0x00008008
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a
+#define    NV40TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define   NV40TCL_BLEND_EQUATION_ALPHA_SHIFT						16
+#define   NV40TCL_BLEND_EQUATION_ALPHA_MASK						0xffff0000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_ADD					0x80060000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_MIN						0x80070000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_MAX						0x80080000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x800a0000
+#define    NV40TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x800b0000
+#define  NV40TCL_COLOR_MASK								0x00000324
+#define   NV40TCL_COLOR_MASK_BUFFER0_B_SHIFT						0
+#define   NV40TCL_COLOR_MASK_BUFFER0_B_MASK						0x000000ff
+#define   NV40TCL_COLOR_MASK_BUFFER0_G_SHIFT						8
+#define   NV40TCL_COLOR_MASK_BUFFER0_G_MASK						0x0000ff00
+#define   NV40TCL_COLOR_MASK_BUFFER0_R_SHIFT						16
+#define   NV40TCL_COLOR_MASK_BUFFER0_R_MASK						0x00ff0000
+#define   NV40TCL_COLOR_MASK_BUFFER0_A_SHIFT						24
+#define   NV40TCL_COLOR_MASK_BUFFER0_A_MASK						0xff000000
+#define  NV40TCL_STENCIL_FRONT_ENABLE							0x00000328
+#define  NV40TCL_STENCIL_FRONT_MASK							0x0000032c
+#define  NV40TCL_STENCIL_FRONT_FUNC_FUNC						0x00000330
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV40TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV40TCL_STENCIL_FRONT_FUNC_REF							0x00000334
+#define  NV40TCL_STENCIL_FRONT_FUNC_MASK						0x00000338
+#define  NV40TCL_STENCIL_FRONT_OP_FAIL							0x0000033c
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_FRONT_OP_ZFAIL							0x00000340
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_FRONT_OP_ZPASS							0x00000344
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_ENABLE							0x00000348
+#define  NV40TCL_STENCIL_BACK_MASK							0x0000034c
+#define  NV40TCL_STENCIL_BACK_FUNC_FUNC							0x00000350
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV40TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV40TCL_STENCIL_BACK_FUNC_REF							0x00000354
+#define  NV40TCL_STENCIL_BACK_FUNC_MASK							0x00000358
+#define  NV40TCL_STENCIL_BACK_OP_FAIL							0x0000035c
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_OP_ZFAIL							0x00000360
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV40TCL_STENCIL_BACK_OP_ZPASS							0x00000364
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV40TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV40TCL_SHADE_MODEL								0x00000368
+#define   NV40TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV40TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV40TCL_MRT_COLOR_MASK								0x00000370
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_A						(1 <<  4)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_R						(1 <<  5)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_G						(1 <<  6)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER1_B						(1 <<  7)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_A						(1 <<  8)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_R						(1 <<  9)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_G						(1 << 10)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER2_B						(1 << 11)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_A						(1 << 12)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_R						(1 << 13)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_G						(1 << 14)
+#define   NV40TCL_MRT_COLOR_MASK_BUFFER3_B						(1 << 15)
+#define  NV40TCL_COLOR_LOGIC_OP_ENABLE							0x00000374
+#define  NV40TCL_COLOR_LOGIC_OP								0x00000378
+#define   NV40TCL_COLOR_LOGIC_OP_CLEAR							0x00001500
+#define   NV40TCL_COLOR_LOGIC_OP_AND							0x00001501
+#define   NV40TCL_COLOR_LOGIC_OP_AND_REVERSE						0x00001502
+#define   NV40TCL_COLOR_LOGIC_OP_COPY							0x00001503
+#define   NV40TCL_COLOR_LOGIC_OP_AND_INVERTED						0x00001504
+#define   NV40TCL_COLOR_LOGIC_OP_NOOP							0x00001505
+#define   NV40TCL_COLOR_LOGIC_OP_XOR							0x00001506
+#define   NV40TCL_COLOR_LOGIC_OP_OR							0x00001507
+#define   NV40TCL_COLOR_LOGIC_OP_NOR							0x00001508
+#define   NV40TCL_COLOR_LOGIC_OP_EQUIV							0x00001509
+#define   NV40TCL_COLOR_LOGIC_OP_INVERT							0x0000150a
+#define   NV40TCL_COLOR_LOGIC_OP_OR_REVERSE						0x0000150b
+#define   NV40TCL_COLOR_LOGIC_OP_COPY_INVERTED						0x0000150c
+#define   NV40TCL_COLOR_LOGIC_OP_OR_INVERTED						0x0000150d
+#define   NV40TCL_COLOR_LOGIC_OP_NAND							0x0000150e
+#define   NV40TCL_COLOR_LOGIC_OP_SET							0x0000150f
+#define  NV40TCL_DEPTH_RANGE_NEAR							0x00000394
+#define  NV40TCL_DEPTH_RANGE_FAR							0x00000398
+#define  NV40TCL_LINE_WIDTH								0x000003b8
+#define  NV40TCL_LINE_SMOOTH_ENABLE							0x000003bc
+#define  NV40TCL_UNK03C0(x)								(0x000003c0+((x)*4))
+#define  NV40TCL_UNK03C0__SIZE								0x00000010
+#define  NV40TCL_UNK0400(x)								(0x00000400+((x)*4))
+#define  NV40TCL_UNK0400__SIZE								0x00000010
+#define  NV40TCL_UNK0440(x)								(0x00000440+((x)*4))
+#define  NV40TCL_UNK0440__SIZE								0x00000020
+#define  NV40TCL_SCISSOR_HORIZ								0x000008c0
+#define   NV40TCL_SCISSOR_HORIZ_X_SHIFT							0
+#define   NV40TCL_SCISSOR_HORIZ_X_MASK							0x0000ffff
+#define   NV40TCL_SCISSOR_HORIZ_W_SHIFT							16
+#define   NV40TCL_SCISSOR_HORIZ_W_MASK							0xffff0000
+#define  NV40TCL_SCISSOR_VERT								0x000008c4
+#define   NV40TCL_SCISSOR_VERT_Y_SHIFT							0
+#define   NV40TCL_SCISSOR_VERT_Y_MASK							0x0000ffff
+#define   NV40TCL_SCISSOR_VERT_H_SHIFT							16
+#define   NV40TCL_SCISSOR_VERT_H_MASK							0xffff0000
+#define  NV40TCL_FOG_MODE								0x000008cc
+#define  NV40TCL_FOG_EQUATION_CONSTANT							0x000008d0
+#define  NV40TCL_FOG_EQUATION_LINEAR							0x000008d4
+#define  NV40TCL_FOG_EQUATION_QUADRATIC							0x000008d8
+#define  NV40TCL_FP_ADDRESS								0x000008e4
+#define   NV40TCL_FP_ADDRESS_OFFSET_SHIFT						8
+#define   NV40TCL_FP_ADDRESS_OFFSET_MASK						0xffffff00
+#define   NV40TCL_FP_ADDRESS_DMA1							(1 <<  1)
+#define   NV40TCL_FP_ADDRESS_DMA0							(1 <<  0)
+#define  NV40TCL_VIEWPORT_HORIZ								0x00000a00
+#define   NV40TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV40TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define   NV40TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV40TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define  NV40TCL_VIEWPORT_VERT								0x00000a04
+#define   NV40TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV40TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define   NV40TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV40TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define  NV40TCL_VIEWPORT_TRANSLATE_X							0x00000a20
+#define  NV40TCL_VIEWPORT_TRANSLATE_Y							0x00000a24
+#define  NV40TCL_VIEWPORT_TRANSLATE_Z							0x00000a28
+#define  NV40TCL_VIEWPORT_TRANSLATE_W							0x00000a2c
+#define  NV40TCL_VIEWPORT_SCALE_X							0x00000a30
+#define  NV40TCL_VIEWPORT_SCALE_Y							0x00000a34
+#define  NV40TCL_VIEWPORT_SCALE_Z							0x00000a38
+#define  NV40TCL_VIEWPORT_SCALE_W							0x00000a3c
+#define  NV40TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000a60
+#define  NV40TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000a64
+#define  NV40TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000a68
+#define  NV40TCL_DEPTH_FUNC								0x00000a6c
+#define   NV40TCL_DEPTH_FUNC_NEVER							0x00000200
+#define   NV40TCL_DEPTH_FUNC_LESS							0x00000201
+#define   NV40TCL_DEPTH_FUNC_EQUAL							0x00000202
+#define   NV40TCL_DEPTH_FUNC_LEQUAL							0x00000203
+#define   NV40TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV40TCL_DEPTH_FUNC_GREATER							0x00000204
+#define   NV40TCL_DEPTH_FUNC_NOTEQUAL							0x00000205
+#define   NV40TCL_DEPTH_FUNC_GEQUAL							0x00000206
+#define   NV40TCL_DEPTH_FUNC_ALWAYS							0x00000207
+#define  NV40TCL_DEPTH_WRITE_ENABLE							0x00000a70
+#define  NV40TCL_DEPTH_TEST_ENABLE							0x00000a74
+#define  NV40TCL_POLYGON_OFFSET_FACTOR							0x00000a78
+#define  NV40TCL_POLYGON_OFFSET_UNITS							0x00000a7c
+#define  NV40TCL_VTX_ATTR_3I_XY(x)							(0x00000a80+((x)*8))
+#define  NV40TCL_VTX_ATTR_3I_XY__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_3I_XY_X_SHIFT						0
+#define   NV40TCL_VTX_ATTR_3I_XY_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_3I_XY_Y_SHIFT						16
+#define   NV40TCL_VTX_ATTR_3I_XY_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_3I_Z(x)							(0x00000a84+((x)*8))
+#define  NV40TCL_VTX_ATTR_3I_Z__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_3I_Z_Z_SHIFT							0
+#define   NV40TCL_VTX_ATTR_3I_Z_Z_MASK							0x0000ffff
+#define  NV40TCL_UNK0B40(x)								(0x00000b40+((x)*4))
+#define  NV40TCL_UNK0B40__SIZE								0x00000008
+#define  NV40TCL_VP_UPLOAD_INST(x)							(0x00000b80+((x)*4))
+#define  NV40TCL_VP_UPLOAD_INST__SIZE							0x00000004
+#define  NV40TCL_CLIP_PLANE_ENABLE							0x00001478
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE0						(1 <<  1)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE1						(1 <<  5)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE2						(1 <<  9)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE3						(1 << 13)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE4						(1 << 17)
+#define   NV40TCL_CLIP_PLANE_ENABLE_PLANE5						(1 << 21)
+#define  NV40TCL_POLYGON_STIPPLE_ENABLE							0x0000147c
+#define  NV40TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001480+((x)*4))
+#define  NV40TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV40TCL_VTX_ATTR_3F_X(x)							(0x00001500+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_3F_Y(x)							(0x00001504+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_3F_Z(x)							(0x00001508+((x)*16))
+#define  NV40TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV40TCL_VTXBUF_ADDRESS(x)							(0x00001680+((x)*4))
+#define  NV40TCL_VTXBUF_ADDRESS__SIZE							0x00000010
+#define   NV40TCL_VTXBUF_ADDRESS_DMA1							(1 << 31)
+#define   NV40TCL_VTXBUF_ADDRESS_OFFSET_SHIFT						0
+#define   NV40TCL_VTXBUF_ADDRESS_OFFSET_MASK						0x0fffffff
+#define  NV40TCL_VTX_CACHE_INVALIDATE							0x00001714
+#define  NV40TCL_VTXFMT(x)								(0x00001740+((x)*4))
+#define  NV40TCL_VTXFMT__SIZE								0x00000010
+#define   NV40TCL_VTXFMT_TYPE_SHIFT							0
+#define   NV40TCL_VTXFMT_TYPE_MASK							0x0000000f
+#define    NV40TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV40TCL_VTXFMT_TYPE_UBYTE							0x00000004
+#define    NV40TCL_VTXFMT_TYPE_USHORT							0x00000005
+#define   NV40TCL_VTXFMT_SIZE_SHIFT							4
+#define   NV40TCL_VTXFMT_SIZE_MASK							0x000000f0
+#define   NV40TCL_VTXFMT_STRIDE_SHIFT							8
+#define   NV40TCL_VTXFMT_STRIDE_MASK							0x0000ff00
+#define  NV40TCL_QUERY_RESET								0x000017c8
+#define  NV40TCL_QUERY_UNK17CC								0x000017cc
+#define  NV40TCL_QUERY_GET								0x00001800
+#define   NV40TCL_QUERY_GET_UNK24_SHIFT							24
+#define   NV40TCL_QUERY_GET_UNK24_MASK							0xff000000
+#define   NV40TCL_QUERY_GET_OFFSET_SHIFT						0
+#define   NV40TCL_QUERY_GET_OFFSET_MASK							0x00ffffff
+#define  NV40TCL_BEGIN_END								0x00001808
+#define   NV40TCL_BEGIN_END_STOP							0x00000000
+#define   NV40TCL_BEGIN_END_POINTS							0x00000001
+#define   NV40TCL_BEGIN_END_LINES							0x00000002
+#define   NV40TCL_BEGIN_END_LINE_LOOP							0x00000003
+#define   NV40TCL_BEGIN_END_LINE_STRIP							0x00000004
+#define   NV40TCL_BEGIN_END_TRIANGLES							0x00000005
+#define   NV40TCL_BEGIN_END_TRIANGLE_STRIP						0x00000006
+#define   NV40TCL_BEGIN_END_TRIANGLE_FAN						0x00000007
+#define   NV40TCL_BEGIN_END_QUADS							0x00000008
+#define   NV40TCL_BEGIN_END_QUAD_STRIP							0x00000009
+#define   NV40TCL_BEGIN_END_POLYGON							0x0000000a
+#define  NV40TCL_VB_ELEMENT_U16								0x0000180c
+#define   NV40TCL_VB_ELEMENT_U16_1_SHIFT						16
+#define   NV40TCL_VB_ELEMENT_U16_1_MASK							0xffff0000
+#define   NV40TCL_VB_ELEMENT_U16_0_SHIFT						0
+#define   NV40TCL_VB_ELEMENT_U16_0_MASK							0x0000ffff
+#define  NV40TCL_VB_ELEMENT_U32								0x00001810
+#define  NV40TCL_VB_VERTEX_BATCH							0x00001814
+#define   NV40TCL_VB_VERTEX_BATCH_COUNT_SHIFT						24
+#define   NV40TCL_VB_VERTEX_BATCH_COUNT_MASK						0xff000000
+#define   NV40TCL_VB_VERTEX_BATCH_START_SHIFT						0
+#define   NV40TCL_VB_VERTEX_BATCH_START_MASK						0x00ffffff
+#define  NV40TCL_VERTEX_DATA								0x00001818
+#define  NV40TCL_IDXBUF_ADDRESS								0x0000181c
+#define  NV40TCL_IDXBUF_FORMAT								0x00001820
+#define   NV40TCL_IDXBUF_FORMAT_TYPE_SHIFT						4
+#define   NV40TCL_IDXBUF_FORMAT_TYPE_MASK						0x000000f0
+#define    NV40TCL_IDXBUF_FORMAT_TYPE_U32						0x00000000
+#define    NV40TCL_IDXBUF_FORMAT_TYPE_U16						0x00000010
+#define   NV40TCL_IDXBUF_FORMAT_DMA1							(1 <<  0)
+#define  NV40TCL_VB_INDEX_BATCH								0x00001824
+#define   NV40TCL_VB_INDEX_BATCH_COUNT_SHIFT						24
+#define   NV40TCL_VB_INDEX_BATCH_COUNT_MASK						0xff000000
+#define   NV40TCL_VB_INDEX_BATCH_START_SHIFT						0
+#define   NV40TCL_VB_INDEX_BATCH_START_MASK						0x00ffffff
+#define  NV40TCL_POLYGON_MODE_FRONT							0x00001828
+#define   NV40TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV40TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV40TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV40TCL_POLYGON_MODE_BACK							0x0000182c
+#define   NV40TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV40TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV40TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV40TCL_CULL_FACE								0x00001830
+#define   NV40TCL_CULL_FACE_FRONT							0x00000404
+#define   NV40TCL_CULL_FACE_BACK							0x00000405
+#define   NV40TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV40TCL_FRONT_FACE								0x00001834
+#define   NV40TCL_FRONT_FACE_CW								0x00000900
+#define   NV40TCL_FRONT_FACE_CCW							0x00000901
+#define  NV40TCL_POLYGON_SMOOTH_ENABLE							0x00001838
+#define  NV40TCL_CULL_FACE_ENABLE							0x0000183c
+#define  NV40TCL_TEX_SIZE1(x)								(0x00001840+((x)*4))
+#define  NV40TCL_TEX_SIZE1__SIZE							0x00000008
+#define   NV40TCL_TEX_SIZE1_DEPTH_SHIFT							20
+#define   NV40TCL_TEX_SIZE1_DEPTH_MASK							0xfff00000
+#define   NV40TCL_TEX_SIZE1_PITCH_SHIFT							0
+#define   NV40TCL_TEX_SIZE1_PITCH_MASK							0x0000ffff
+#define  NV40TCL_VTX_ATTR_2F_X(x)							(0x00001880+((x)*8))
+#define  NV40TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_2F_Y(x)							(0x00001884+((x)*8))
+#define  NV40TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_2I(x)								(0x00001900+((x)*4))
+#define  NV40TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV40TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV40TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_4UB(x)							(0x00001940+((x)*4))
+#define  NV40TCL_VTX_ATTR_4UB__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4UB_X_SHIFT							0
+#define   NV40TCL_VTX_ATTR_4UB_X_MASK							0x000000ff
+#define   NV40TCL_VTX_ATTR_4UB_Y_SHIFT							8
+#define   NV40TCL_VTX_ATTR_4UB_Y_MASK							0x0000ff00
+#define   NV40TCL_VTX_ATTR_4UB_Z_SHIFT							16
+#define   NV40TCL_VTX_ATTR_4UB_Z_MASK							0x00ff0000
+#define   NV40TCL_VTX_ATTR_4UB_W_SHIFT							24
+#define   NV40TCL_VTX_ATTR_4UB_W_MASK							0xff000000
+#define  NV40TCL_VTX_ATTR_4I_XY(x)							(0x00001980+((x)*8))
+#define  NV40TCL_VTX_ATTR_4I_XY__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4I_XY_X_SHIFT						0
+#define   NV40TCL_VTX_ATTR_4I_XY_X_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_4I_XY_Y_SHIFT						16
+#define   NV40TCL_VTX_ATTR_4I_XY_Y_MASK							0xffff0000
+#define  NV40TCL_VTX_ATTR_4I_ZW(x)							(0x00001984+((x)*8))
+#define  NV40TCL_VTX_ATTR_4I_ZW__SIZE							0x00000010
+#define   NV40TCL_VTX_ATTR_4I_ZW_Z_SHIFT						0
+#define   NV40TCL_VTX_ATTR_4I_ZW_Z_MASK							0x0000ffff
+#define   NV40TCL_VTX_ATTR_4I_ZW_W_SHIFT						16
+#define   NV40TCL_VTX_ATTR_4I_ZW_W_MASK							0xffff0000
+#define  NV40TCL_TEX_OFFSET(x)								(0x00001a00+((x)*32))
+#define  NV40TCL_TEX_OFFSET__SIZE							0x00000010
+#define  NV40TCL_TEX_FORMAT(x)								(0x00001a04+((x)*32))
+#define  NV40TCL_TEX_FORMAT__SIZE							0x00000010
+#define   NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT						16
+#define   NV40TCL_TEX_FORMAT_MIPMAP_COUNT_MASK						0x000f0000
+#define   NV40TCL_TEX_FORMAT_RECT							(1 << 14)
+#define   NV40TCL_TEX_FORMAT_LINEAR							(1 << 13)
+#define   NV40TCL_TEX_FORMAT_FORMAT_SHIFT						8
+#define   NV40TCL_TEX_FORMAT_FORMAT_MASK						0x00001f00
+#define    NV40TCL_TEX_FORMAT_FORMAT_L8							0x00000100
+#define    NV40TCL_TEX_FORMAT_FORMAT_A1R5G5B5						0x00000200
+#define    NV40TCL_TEX_FORMAT_FORMAT_A4R4G4B4						0x00000300
+#define    NV40TCL_TEX_FORMAT_FORMAT_R5G6B5						0x00000400
+#define    NV40TCL_TEX_FORMAT_FORMAT_A8R8G8B8						0x00000500
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT1						0x00000600
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT3						0x00000700
+#define    NV40TCL_TEX_FORMAT_FORMAT_DXT5						0x00000800
+#define    NV40TCL_TEX_FORMAT_FORMAT_A8L8						0x00000b00
+#define    NV40TCL_TEX_FORMAT_FORMAT_Z24						0x00001000
+#define    NV40TCL_TEX_FORMAT_FORMAT_Z16						0x00001200
+#define    NV40TCL_TEX_FORMAT_FORMAT_A16                                                0x00001400
+#define    NV40TCL_TEX_FORMAT_FORMAT_HILO8						0x00001800
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA16F						0x00001a00
+#define    NV40TCL_TEX_FORMAT_FORMAT_RGBA32F						0x00001b00
+#define   NV40TCL_TEX_FORMAT_DIMS_SHIFT							4
+#define   NV40TCL_TEX_FORMAT_DIMS_MASK							0x000000f0
+#define    NV40TCL_TEX_FORMAT_DIMS_1D							0x00000010
+#define    NV40TCL_TEX_FORMAT_DIMS_2D							0x00000020
+#define    NV40TCL_TEX_FORMAT_DIMS_3D							0x00000030
+#define   NV40TCL_TEX_FORMAT_NO_BORDER							(1 <<  3)
+#define   NV40TCL_TEX_FORMAT_CUBIC							(1 <<  2)
+#define   NV40TCL_TEX_FORMAT_DMA1							(1 <<  1)
+#define   NV40TCL_TEX_FORMAT_DMA0							(1 <<  0)
+#define  NV40TCL_TEX_WRAP(x)								(0x00001a08+((x)*32))
+#define  NV40TCL_TEX_WRAP__SIZE								0x00000010
+#define   NV40TCL_TEX_WRAP_S_SHIFT							0
+#define   NV40TCL_TEX_WRAP_S_MASK							0x000000ff
+#define    NV40TCL_TEX_WRAP_S_REPEAT							0x00000001
+#define    NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT						0x00000002
+#define    NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE						0x00000003
+#define    NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER						0x00000004
+#define    NV40TCL_TEX_WRAP_S_CLAMP							0x00000005
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE					0x00000006
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER					0x00000007
+#define    NV40TCL_TEX_WRAP_S_MIRROR_CLAMP						0x00000008
+#define   NV40TCL_TEX_WRAP_T_SHIFT							8
+#define   NV40TCL_TEX_WRAP_T_MASK							0x0000ff00
+#define    NV40TCL_TEX_WRAP_T_REPEAT							0x00000100
+#define    NV40TCL_TEX_WRAP_T_MIRRORED_REPEAT						0x00000200
+#define    NV40TCL_TEX_WRAP_T_CLAMP_TO_EDGE						0x00000300
+#define    NV40TCL_TEX_WRAP_T_CLAMP_TO_BORDER						0x00000400
+#define    NV40TCL_TEX_WRAP_T_CLAMP							0x00000500
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_EDGE					0x00000600
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_BORDER					0x00000700
+#define    NV40TCL_TEX_WRAP_T_MIRROR_CLAMP						0x00000800
+#define   NV40TCL_TEX_WRAP_R_SHIFT							16
+#define   NV40TCL_TEX_WRAP_R_MASK							0x00ff0000
+#define    NV40TCL_TEX_WRAP_R_REPEAT							0x00010000
+#define    NV40TCL_TEX_WRAP_R_MIRRORED_REPEAT						0x00020000
+#define    NV40TCL_TEX_WRAP_R_CLAMP_TO_EDGE						0x00030000
+#define    NV40TCL_TEX_WRAP_R_CLAMP_TO_BORDER						0x00040000
+#define    NV40TCL_TEX_WRAP_R_CLAMP							0x00050000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_EDGE					0x00060000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_BORDER					0x00070000
+#define    NV40TCL_TEX_WRAP_R_MIRROR_CLAMP						0x00080000
+#define   NV40TCL_TEX_WRAP_RCOMP_SHIFT							28
+#define   NV40TCL_TEX_WRAP_RCOMP_MASK							0xf0000000
+#define    NV40TCL_TEX_WRAP_RCOMP_NEVER							0x00000000
+#define    NV40TCL_TEX_WRAP_RCOMP_GREATER						0x10000000
+#define    NV40TCL_TEX_WRAP_RCOMP_EQUAL							0x20000000
+#define    NV40TCL_TEX_WRAP_RCOMP_GEQUAL						0x30000000
+#define    NV40TCL_TEX_WRAP_RCOMP_LESS							0x40000000
+#define    NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL						0x50000000
+#define    NV40TCL_TEX_WRAP_RCOMP_LEQUAL						0x60000000
+#define    NV40TCL_TEX_WRAP_RCOMP_ALWAYS						0x70000000
+#define  NV40TCL_TEX_ENABLE(x)								(0x00001a0c+((x)*32))
+#define  NV40TCL_TEX_ENABLE__SIZE							0x00000010
+#define   NV40TCL_TEX_ENABLE_ENABLE							(1 << 31)
+#define   NV40TCL_TEX_ENABLE_ANISO_SHIFT						4
+#define   NV40TCL_TEX_ENABLE_ANISO_MASK							0x000000f0
+#define    NV40TCL_TEX_ENABLE_ANISO_NONE						0x00000000
+#define    NV40TCL_TEX_ENABLE_ANISO_2X							0x00000010
+#define    NV40TCL_TEX_ENABLE_ANISO_4X							0x00000020
+#define    NV40TCL_TEX_ENABLE_ANISO_6X							0x00000030
+#define    NV40TCL_TEX_ENABLE_ANISO_8X							0x00000040
+#define    NV40TCL_TEX_ENABLE_ANISO_10X							0x00000050
+#define    NV40TCL_TEX_ENABLE_ANISO_12X							0x00000060
+#define    NV40TCL_TEX_ENABLE_ANISO_16X							0x00000070
+#define  NV40TCL_TEX_SWIZZLE(x)								(0x00001a10+((x)*32))
+#define  NV40TCL_TEX_SWIZZLE__SIZE							0x00000010
+#define   NV40TCL_TEX_SWIZZLE_S0_X_SHIFT						14
+#define   NV40TCL_TEX_SWIZZLE_S0_X_MASK							0x0000c000
+#define    NV40TCL_TEX_SWIZZLE_S0_X_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_X_ONE							0x00004000
+#define    NV40TCL_TEX_SWIZZLE_S0_X_S1							0x00008000
+#define   NV40TCL_TEX_SWIZZLE_S0_Y_SHIFT						12
+#define   NV40TCL_TEX_SWIZZLE_S0_Y_MASK							0x00003000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_ONE							0x00001000
+#define    NV40TCL_TEX_SWIZZLE_S0_Y_S1							0x00002000
+#define   NV40TCL_TEX_SWIZZLE_S0_Z_SHIFT						10
+#define   NV40TCL_TEX_SWIZZLE_S0_Z_MASK							0x00000c00
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_ONE							0x00000400
+#define    NV40TCL_TEX_SWIZZLE_S0_Z_S1							0x00000800
+#define   NV40TCL_TEX_SWIZZLE_S0_W_SHIFT						8
+#define   NV40TCL_TEX_SWIZZLE_S0_W_MASK							0x00000300
+#define    NV40TCL_TEX_SWIZZLE_S0_W_ZERO						0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S0_W_ONE							0x00000100
+#define    NV40TCL_TEX_SWIZZLE_S0_W_S1							0x00000200
+#define   NV40TCL_TEX_SWIZZLE_S1_X_SHIFT						6
+#define   NV40TCL_TEX_SWIZZLE_S1_X_MASK							0x000000c0
+#define    NV40TCL_TEX_SWIZZLE_S1_X_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_X_Z							0x00000040
+#define    NV40TCL_TEX_SWIZZLE_S1_X_Y							0x00000080
+#define    NV40TCL_TEX_SWIZZLE_S1_X_X							0x000000c0
+#define   NV40TCL_TEX_SWIZZLE_S1_Y_SHIFT						4
+#define   NV40TCL_TEX_SWIZZLE_S1_Y_MASK							0x00000030
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_Z							0x00000010
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_Y							0x00000020
+#define    NV40TCL_TEX_SWIZZLE_S1_Y_X							0x00000030
+#define   NV40TCL_TEX_SWIZZLE_S1_Z_SHIFT						2
+#define   NV40TCL_TEX_SWIZZLE_S1_Z_MASK							0x0000000c
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_Z							0x00000004
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_Y							0x00000008
+#define    NV40TCL_TEX_SWIZZLE_S1_Z_X							0x0000000c
+#define   NV40TCL_TEX_SWIZZLE_S1_W_SHIFT						0
+#define   NV40TCL_TEX_SWIZZLE_S1_W_MASK							0x00000003
+#define    NV40TCL_TEX_SWIZZLE_S1_W_W							0x00000000
+#define    NV40TCL_TEX_SWIZZLE_S1_W_Z							0x00000001
+#define    NV40TCL_TEX_SWIZZLE_S1_W_Y							0x00000002
+#define    NV40TCL_TEX_SWIZZLE_S1_W_X							0x00000003
+#define  NV40TCL_TEX_FILTER(x)								(0x00001a14+((x)*32))
+#define  NV40TCL_TEX_FILTER__SIZE							0x00000010
+#define   NV40TCL_TEX_FILTER_SIGNED_ALPHA						(1 << 31)
+#define   NV40TCL_TEX_FILTER_SIGNED_RED							(1 << 30)
+#define   NV40TCL_TEX_FILTER_SIGNED_GREEN						(1 << 29)
+#define   NV40TCL_TEX_FILTER_SIGNED_BLUE						(1 << 28)
+#define   NV40TCL_TEX_FILTER_MIN_SHIFT							16
+#define   NV40TCL_TEX_FILTER_MIN_MASK							0x000f0000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST						0x00010000
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR						0x00020000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST				0x00030000
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST					0x00040000
+#define    NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR					0x00050000
+#define    NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR					0x00060000
+#define   NV40TCL_TEX_FILTER_MAG_SHIFT							24
+#define   NV40TCL_TEX_FILTER_MAG_MASK							0x0f000000
+#define    NV40TCL_TEX_FILTER_MAG_NEAREST						0x01000000
+#define    NV40TCL_TEX_FILTER_MAG_LINEAR						0x02000000
+#define  NV40TCL_TEX_SIZE0(x)								(0x00001a18+((x)*32))
+#define  NV40TCL_TEX_SIZE0__SIZE							0x00000010
+#define   NV40TCL_TEX_SIZE0_H_SHIFT							0
+#define   NV40TCL_TEX_SIZE0_H_MASK							0x0000ffff
+#define   NV40TCL_TEX_SIZE0_W_SHIFT							16
+#define   NV40TCL_TEX_SIZE0_W_MASK							0xffff0000
+#define  NV40TCL_TEX_BORDER_COLOR(x)							(0x00001a1c+((x)*32))
+#define  NV40TCL_TEX_BORDER_COLOR__SIZE							0x00000010
+#define   NV40TCL_TEX_BORDER_COLOR_B_SHIFT						0
+#define   NV40TCL_TEX_BORDER_COLOR_B_MASK						0x000000ff
+#define   NV40TCL_TEX_BORDER_COLOR_G_SHIFT						8
+#define   NV40TCL_TEX_BORDER_COLOR_G_MASK						0x0000ff00
+#define   NV40TCL_TEX_BORDER_COLOR_R_SHIFT						16
+#define   NV40TCL_TEX_BORDER_COLOR_R_MASK						0x00ff0000
+#define   NV40TCL_TEX_BORDER_COLOR_A_SHIFT						24
+#define   NV40TCL_TEX_BORDER_COLOR_A_MASK						0xff000000
+#define  NV40TCL_VTX_ATTR_4F_X(x)							(0x00001c00+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_X__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_Y(x)							(0x00001c04+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_Z(x)							(0x00001c08+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV40TCL_VTX_ATTR_4F_W(x)							(0x00001c0c+((x)*16))
+#define  NV40TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV40TCL_FP_CONTROL								0x00001d60
+#define   NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT						24
+#define   NV40TCL_FP_CONTROL_TEMP_COUNT_MASK						0xff000000
+#define   NV40TCL_FP_CONTROL_KIL							(1 <<  7)
+#define  NV40TCL_MULTISAMPLE_CONTROL							0x00001d7c
+#define  NV40TCL_CLEAR_VALUE_DEPTH							0x00001d8c
+#define  NV40TCL_CLEAR_VALUE_COLOR							0x00001d90
+#define  NV40TCL_CLEAR_BUFFERS								0x00001d94
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_A							(1 <<  7)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_B							(1 <<  6)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_G							(1 <<  5)
+#define   NV40TCL_CLEAR_BUFFERS_COLOR_R							(1 <<  4)
+#define   NV40TCL_CLEAR_BUFFERS_STENCIL							(1 <<  1)
+#define   NV40TCL_CLEAR_BUFFERS_DEPTH							(1 <<  0)
+#define  NV40TCL_LINE_STIPPLE_ENABLE							0x00001db4
+#define  NV40TCL_LINE_STIPPLE_PATTERN							0x00001db8
+#define   NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT					0
+#define   NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK					0x0000ffff
+#define   NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT					16
+#define   NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK					0xffff0000
+#define  NV40TCL_VTX_ATTR_1F(x)								(0x00001e40+((x)*4))
+#define  NV40TCL_VTX_ATTR_1F__SIZE							0x00000010
+#define  NV40TCL_VP_UPLOAD_FROM_ID							0x00001e9c
+#define  NV40TCL_VP_START_FROM_ID							0x00001ea0
+#define  NV40TCL_POINT_SIZE								0x00001ee0
+#define  NV40TCL_POINT_SPRITE								0x00001ee8
+#define  NV40TCL_VP_UPLOAD_CONST_ID							0x00001efc
+#define  NV40TCL_VP_UPLOAD_CONST_X(x)							(0x00001f00+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_X__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_Y(x)							(0x00001f04+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_Y__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_Z(x)							(0x00001f08+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_Z__SIZE						0x00000004
+#define  NV40TCL_VP_UPLOAD_CONST_W(x)							(0x00001f0c+((x)*16))
+#define  NV40TCL_VP_UPLOAD_CONST_W__SIZE						0x00000004
+#define  NV40TCL_TEX_CACHE_CTL								0x00001fd8
+#define  NV40TCL_VP_ATTRIB_EN								0x00001ff0
+#define  NV40TCL_VP_RESULT_EN								0x00001ff4
+
+
+#define NV44TCL										0x00004497
+
+
+
+#define NV50_2D										0x0000502d
+
+#define  NV50_2D_NOP									0x00000100
+#define  NV50_2D_NOTIFY									0x00000104
+#define  NV50_2D_DMA_NOTIFY								0x00000180
+#define  NV50_2D_DMA_IN_MEMORY0								0x00000184
+#define  NV50_2D_DMA_IN_MEMORY1								0x00000188
+#define  NV50_2D_DMA_IN_MEMORY2								0x0000018c
+#define  NV50_2D_DST_FORMAT								0x00000200
+#define   NV50_2D_DST_FORMAT_32BPP							0x000000cf
+#define   NV50_2D_DST_FORMAT_24BPP							0x000000e6
+#define   NV50_2D_DST_FORMAT_16BPP							0x000000e8
+#define   NV50_2D_DST_FORMAT_8BPP							0x000000f3
+#define   NV50_2D_DST_FORMAT_15BPP							0x000000f8
+#define  NV50_2D_DST_PITCH								0x00000214
+#define  NV50_2D_DST_WIDTH								0x00000218
+#define  NV50_2D_DST_HEIGHT								0x0000021c
+#define  NV50_2D_DST_ADDRESS_HIGH							0x00000220
+#define  NV50_2D_DST_ADDRESS_LOW							0x00000224
+#define  NV50_2D_SRC_FORMAT								0x00000230
+#define   NV50_2D_SRC_FORMAT_32BPP							0x000000cf
+#define   NV50_2D_SRC_FORMAT_24BPP							0x000000e6
+#define   NV50_2D_SRC_FORMAT_16BPP							0x000000e8
+#define   NV50_2D_SRC_FORMAT_8BPP							0x000000f3
+#define   NV50_2D_SRC_FORMAT_15BPP							0x000000f8
+#define  NV50_2D_SRC_PITCH								0x00000244
+#define  NV50_2D_SRC_WIDTH								0x00000248
+#define  NV50_2D_SRC_HEIGHT								0x0000024c
+#define  NV50_2D_SRC_ADDRESS_HIGH							0x00000250
+#define  NV50_2D_SRC_ADDRESS_LOW							0x00000254
+#define  NV50_2D_CLIP_X									0x00000280
+#define  NV50_2D_CLIP_Y									0x00000284
+#define  NV50_2D_CLIP_Z									0x00000288
+#define  NV50_2D_CLIP_W									0x0000028c
+#define  NV50_2D_ROP									0x000002a0
+#define  NV50_2D_OPERATION								0x000002ac
+#define   NV50_2D_OPERATION_SRCCOPY_AND							0x00000000
+#define   NV50_2D_OPERATION_ROP_AND							0x00000001
+#define   NV50_2D_OPERATION_BLEND_AND							0x00000002
+#define   NV50_2D_OPERATION_SRCCOPY							0x00000003
+#define   NV50_2D_OPERATION_SRCCOPY_PREMULT						0x00000004
+#define   NV50_2D_OPERATION_BLEND_PREMULT						0x00000005
+#define  NV50_2D_PATTERN_FORMAT								0x000002e8
+#define   NV50_2D_PATTERN_FORMAT_16BPP							0x00000000
+#define   NV50_2D_PATTERN_FORMAT_15BPP							0x00000001
+#define   NV50_2D_PATTERN_FORMAT_32BPP							0x00000002
+#define   NV50_2D_PATTERN_FORMAT_8BPP							0x00000003
+#define  NV50_2D_PATTERN_COLOR(x)							(0x000002f0+((x)*4))
+#define  NV50_2D_PATTERN_COLOR__SIZE							0x00000002
+#define  NV50_2D_PATTERN_BITMAP(x)							(0x000002f8+((x)*4))
+#define  NV50_2D_PATTERN_BITMAP__SIZE							0x00000002
+#define  NV50_2D_RECT_FORMAT								0x00000584
+#define   NV50_2D_RECT_FORMAT_32BPP							0x000000cf
+#define   NV50_2D_RECT_FORMAT_24BPP							0x000000e6
+#define   NV50_2D_RECT_FORMAT_16BPP							0x000000e8
+#define   NV50_2D_RECT_FORMAT_8BPP							0x000000f3
+#define   NV50_2D_RECT_FORMAT_15BPP							0x000000f8
+#define  NV50_2D_RECT_COLOR								0x00000588
+#define  NV50_2D_RECT_X1								0x00000600
+#define  NV50_2D_RECT_Y1								0x00000604
+#define  NV50_2D_RECT_X2								0x00000608
+#define  NV50_2D_RECT_Y2								0x0000060c
+#define  NV50_2D_SIFC_UNK0800								0x00000800
+#define  NV50_2D_SIFC_FORMAT								0x00000804
+#define   NV50_2D_SIFC_FORMAT_32BPP							0x000000cf
+#define   NV50_2D_SIFC_FORMAT_24BPP							0x000000e6
+#define   NV50_2D_SIFC_FORMAT_16BPP							0x000000e8
+#define   NV50_2D_SIFC_FORMAT_8BPP							0x000000f3
+#define   NV50_2D_SIFC_FORMAT_15BPP							0x000000f8
+#define  NV50_2D_SIFC_WIDTH								0x00000838
+#define  NV50_2D_SIFC_HEIGHT								0x0000083c
+#define  NV50_2D_SIFC_SCALE_UNK0840							0x00000840
+#define  NV50_2D_SIFC_SCALE_UNK0844							0x00000844
+#define  NV50_2D_SIFC_SCALE_UNK0848							0x00000848
+#define  NV50_2D_SIFC_SCALE_UNK084C							0x0000084c
+#define  NV50_2D_SIFC_UNK0850								0x00000850
+#define  NV50_2D_SIFC_DST_X								0x00000854
+#define  NV50_2D_SIFC_UNK0858								0x00000858
+#define  NV50_2D_SIFC_DST_Y								0x0000085c
+#define  NV50_2D_SIFC_DATA								0x00000860
+#define  NV50_2D_BLIT_DST_X								0x000008b0
+#define  NV50_2D_BLIT_DST_Y								0x000008b4
+#define  NV50_2D_BLIT_DST_W								0x000008b8
+#define  NV50_2D_BLIT_DST_H								0x000008bc
+#define  NV50_2D_BLIT_SRC_X								0x000008d4
+#define  NV50_2D_BLIT_SRC_Y								0x000008dc
+
+
+#define NV50_MEMORY_TO_MEMORY_FORMAT							0x00005039
+
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH					0x00000238
+#define  NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH					0x0000023c
+
+
+#define NV50TCL										0x00005097
+
+#define  NV50TCL_NOP									0x00000100
+#define  NV50TCL_NOTIFY									0x00000104
+#define  NV50TCL_DMA_NOTIFY								0x00000180
+#define  NV50TCL_DMA_UNK0(x)								(0x00000184+((x)*4))
+#define  NV50TCL_DMA_UNK0__SIZE								0x0000000b
+#define  NV50TCL_DMA_UNK1(x)								(0x000001c0+((x)*4))
+#define  NV50TCL_DMA_UNK1__SIZE								0x00000008
+#define  NV50TCL_RT_ADDRESS_HIGH(x)							(0x00000200+((x)*32))
+#define  NV50TCL_RT_ADDRESS_HIGH__SIZE							0x00000008
+#define  NV50TCL_RT_ADDRESS_LOW(x)							(0x00000204+((x)*32))
+#define  NV50TCL_RT_ADDRESS_LOW__SIZE							0x00000008
+#define  NV50TCL_RT_FORMAT(x)								(0x00000208+((x)*32))
+#define  NV50TCL_RT_FORMAT__SIZE							0x00000008
+#define   NV50TCL_RT_FORMAT_32BPP							0x000000cf
+#define   NV50TCL_RT_FORMAT_24BPP							0x000000e6
+#define   NV50TCL_RT_FORMAT_16BPP							0x000000e8
+#define   NV50TCL_RT_FORMAT_8BPP							0x000000f3
+#define   NV50TCL_RT_FORMAT_15BPP							0x000000f8
+#define  NV50TCL_RT_TILE_UNK(x)								(0x0000020c+((x)*32))
+#define  NV50TCL_RT_TILE_UNK__SIZE							0x00000008
+#define  NV50TCL_RT_UNK4(x)								(0x00000210+((x)*32))
+#define  NV50TCL_RT_UNK4__SIZE								0x00000008
+#define  NV50TCL_VTX_ATTR_1F(x)								(0x00000300+((x)*4))
+#define  NV50TCL_VTX_ATTR_1F__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_2F_X(x)							(0x00000380+((x)*8))
+#define  NV50TCL_VTX_ATTR_2F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_2F_Y(x)							(0x00000384+((x)*8))
+#define  NV50TCL_VTX_ATTR_2F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_X(x)							(0x00000400+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_Y(x)							(0x00000404+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_Z(x)							(0x00000408+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_Z__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_3F_W(x)							(0x0000040c+((x)*16))
+#define  NV50TCL_VTX_ATTR_3F_W__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_X(x)							(0x00000500+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_X__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_Y(x)							(0x00000504+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_Y__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_Z(x)							(0x00000508+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_Z__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_4F_W(x)							(0x0000050c+((x)*16))
+#define  NV50TCL_VTX_ATTR_4F_W__SIZE							0x00000010
+#define  NV50TCL_VTX_ATTR_2I(x)								(0x00000680+((x)*4))
+#define  NV50TCL_VTX_ATTR_2I__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_2I_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_2I_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_2I_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_2I_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4I_0(x)							(0x00000700+((x)*8))
+#define  NV50TCL_VTX_ATTR_4I_0__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4I_0_X_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4I_0_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4I_0_Y_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4I_0_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4I_1(x)							(0x00000704+((x)*8))
+#define  NV50TCL_VTX_ATTR_4I_1__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4I_1_Z_SHIFT							0
+#define   NV50TCL_VTX_ATTR_4I_1_Z_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4I_1_W_SHIFT							16
+#define   NV50TCL_VTX_ATTR_4I_1_W_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4NI_0(x)							(0x00000780+((x)*8))
+#define  NV50TCL_VTX_ATTR_4NI_0__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NI_0_X_SHIFT						0
+#define   NV50TCL_VTX_ATTR_4NI_0_X_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4NI_0_Y_SHIFT						16
+#define   NV50TCL_VTX_ATTR_4NI_0_Y_MASK							0xffff0000
+#define  NV50TCL_VTX_ATTR_4NI_1(x)							(0x00000784+((x)*8))
+#define  NV50TCL_VTX_ATTR_4NI_1__SIZE							0x00000010
+#define   NV50TCL_VTX_ATTR_4NI_1_Z_SHIFT						0
+#define   NV50TCL_VTX_ATTR_4NI_1_Z_MASK							0x0000ffff
+#define   NV50TCL_VTX_ATTR_4NI_1_W_SHIFT						16
+#define   NV50TCL_VTX_ATTR_4NI_1_W_MASK							0xffff0000
+#define  NV50TCL_VERTEX_ARRAY_FORMAT(x)							(0x00000900+((x)*16))
+#define  NV50TCL_VERTEX_ARRAY_FORMAT__SIZE						0x00000010
+#define  NV50TCL_VIEWPORT_UNK0(x)							(0x00000a00+((x)*4))
+#define  NV50TCL_VIEWPORT_UNK0__SIZE							0x00000003
+#define  NV50TCL_VIEWPORT_UNK1(x)							(0x00000a0c+((x)*4))
+#define  NV50TCL_VIEWPORT_UNK1__SIZE							0x00000003
+#define  NV50TCL_VIEWPORT_HORIZ								0x00000c00
+#define   NV50TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NV50TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NV50TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NV50TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NV50TCL_VIEWPORT_VERT								0x00000c04
+#define   NV50TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NV50TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NV50TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NV50TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NV50TCL_DEPTH_RANGE_NEAR							0x00000c08
+#define  NV50TCL_DEPTH_RANGE_FAR							0x00000c0c
+#define  NV50TCL_VIEWPORT_CLIP_HORIZ(x)							(0x00000d00+((x)*8))
+#define  NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define  NV50TCL_VIEWPORT_CLIP_VERT(x)							(0x00000d04+((x)*8))
+#define  NV50TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define  NV50TCL_VERTEX_BUFFER_FIRST							0x00000d74
+#define  NV50TCL_VERTEX_BUFFER_COUNT							0x00000d78
+#define  NV50TCL_CLEAR_COLOR(x)								(0x00000d80+((x)*4))
+#define  NV50TCL_CLEAR_COLOR__SIZE							0x00000004
+#define  NV50TCL_CLEAR_DEPTH								0x00000d90
+#define  NV50TCL_CLEAR_STENCIL								0x00000da0
+#define  NV50TCL_POLYGON_MODE_FRONT							0x00000dac
+#define   NV50TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NV50TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NV50TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NV50TCL_POLYGON_MODE_BACK							0x00000db0
+#define   NV50TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NV50TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NV50TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NV50TCL_POLYGON_SMOOTH_ENABLE							0x00000db4
+#define  NV50TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000dc0
+#define  NV50TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000dc4
+#define  NV50TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000dc8
+#define  NV50TCL_SCISSOR_HORIZ								0x00000e04
+#define   NV50TCL_SCISSOR_HORIZ_L_SHIFT							0
+#define   NV50TCL_SCISSOR_HORIZ_L_MASK							0x0000ffff
+#define   NV50TCL_SCISSOR_HORIZ_R_SHIFT							16
+#define   NV50TCL_SCISSOR_HORIZ_R_MASK							0xffff0000
+#define  NV50TCL_SCISSOR_VERT								0x00000e08
+#define   NV50TCL_SCISSOR_VERT_T_SHIFT							0
+#define   NV50TCL_SCISSOR_VERT_T_MASK							0x0000ffff
+#define   NV50TCL_SCISSOR_VERT_B_SHIFT							16
+#define   NV50TCL_SCISSOR_VERT_B_MASK							0xffff0000
+#define  NV50TCL_CB_ADDR								0x00000f00
+#define   NV50TCL_CB_ADDR_ID_SHIFT							8
+#define   NV50TCL_CB_ADDR_ID_MASK							0xffffff00
+#define   NV50TCL_CB_ADDR_BUFFER_SHIFT							0
+#define   NV50TCL_CB_ADDR_BUFFER_MASK							0x000000ff
+#define  NV50TCL_CB_DATA(x)								(0x00000f04+((x)*4))
+#define  NV50TCL_CB_DATA__SIZE								0x00000010
+#define  NV50TCL_STENCIL_FRONT_FUNC_REF							0x00000f54
+#define  NV50TCL_STENCIL_FRONT_MASK							0x00000f58
+#define  NV50TCL_STENCIL_FRONT_FUNC_MASK						0x00000f5c
+#define  NV50TCL_GP_ADDRESS_HIGH							0x00000f70
+#define  NV50TCL_GP_ADDRESS_LOW								0x00000f74
+#define  NV50TCL_VP_ADDRESS_HIGH							0x00000f7c
+#define  NV50TCL_VP_ADDRESS_LOW								0x00000f80
+#define  NV50TCL_FP_ADDRESS_HIGH							0x00000fa4
+#define  NV50TCL_FP_ADDRESS_LOW								0x00000fa8
+#define  NV50TCL_ZETA_ADDRESS_HIGH							0x00000fe0
+#define  NV50TCL_ZETA_ADDRESS_LOW							0x00000fe4
+#define  NV50TCL_UNKFF4									0x00000ff4
+#define   NV50TCL_UNKFF4_W_SHIFT							16
+#define   NV50TCL_UNKFF4_W_MASK								0xffff0000
+#define  NV50TCL_UNKFF8									0x00000ff8
+#define   NV50TCL_UNKFF8_H_SHIFT							16
+#define   NV50TCL_UNKFF8_H_MASK								0xffff0000
+#define  NV50TCL_RT_HORIZ(x)								(0x00001240+((x)*8))
+#define  NV50TCL_RT_HORIZ__SIZE								0x00000008
+#define  NV50TCL_RT_VERT(x)								(0x00001244+((x)*8))
+#define  NV50TCL_RT_VERT__SIZE								0x00000008
+#define  NV50TCL_CB_DEF_ADDRESS_HIGH							0x00001280
+#define  NV50TCL_CB_DEF_ADDRESS_LOW							0x00001284
+#define  NV50TCL_CB_DEF_SET								0x00001288
+#define   NV50TCL_CB_DEF_SET_SIZE_SHIFT							0
+#define   NV50TCL_CB_DEF_SET_SIZE_MASK							0x0000ffff
+#define   NV50TCL_CB_DEF_SET_BUFFER_SHIFT						16
+#define   NV50TCL_CB_DEF_SET_BUFFER_MASK						0xffff0000
+#define  NV50TCL_DEPTH_TEST_ENABLE							0x000012cc
+#define  NV50TCL_SHADE_MODEL								0x000012d4
+#define   NV50TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NV50TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NV50TCL_DEPTH_WRITE_ENABLE							0x000012e8
+#define  NV50TCL_ALPHA_TEST_ENABLE							0x000012ec
+#define  NV50TCL_DEPTH_TEST_FUNC							0x0000130c
+#define   NV50TCL_DEPTH_TEST_FUNC_NEVER							0x00000200
+#define   NV50TCL_DEPTH_TEST_FUNC_LESS							0x00000201
+#define   NV50TCL_DEPTH_TEST_FUNC_EQUAL							0x00000202
+#define   NV50TCL_DEPTH_TEST_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_DEPTH_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_DEPTH_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_DEPTH_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV50TCL_DEPTH_TEST_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_DEPTH_TEST_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_ALPHA_TEST_REF								0x00001310
+#define  NV50TCL_ALPHA_TEST_FUNC							0x00001314
+#define   NV50TCL_ALPHA_TEST_FUNC_NEVER							0x00000200
+#define   NV50TCL_ALPHA_TEST_FUNC_LESS							0x00000201
+#define   NV50TCL_ALPHA_TEST_FUNC_EQUAL							0x00000202
+#define   NV50TCL_ALPHA_TEST_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NV50TCL_ALPHA_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NV50TCL_ALPHA_TEST_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_ALPHA_TEST_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_BLEND_COLOR(x)								(0x0000131c+((x)*4))
+#define  NV50TCL_BLEND_COLOR__SIZE							0x00000004
+#define  NV50TCL_BLEND_EQUATION_RGB							0x00001340
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_ADD						0x00008006
+#define   NV50TCL_BLEND_EQUATION_RGB_MIN						0x00008007
+#define   NV50TCL_BLEND_EQUATION_RGB_MAX						0x00008008
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a
+#define   NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NV50TCL_BLEND_FUNC_SRC_RGB							0x00001344
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE					0x00000308
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_FUNC_DST_RGB							0x00001348
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE					0x00000308
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_EQUATION_ALPHA							0x0000134c
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD						0x00008006
+#define   NV50TCL_BLEND_EQUATION_ALPHA_MIN						0x00008007
+#define   NV50TCL_BLEND_EQUATION_ALPHA_MAX						0x00008008
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x0000800a
+#define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NV50TCL_BLEND_FUNC_SRC_ALPHA							0x00001350
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x00000300
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x00000302
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x00000304
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x00000306
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00000308
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x00008001
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_FUNC_DST_ALPHA							0x00001358
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00000001
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x00000300
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x00000302
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x00000304
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x00000306
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x00000308
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x00008001
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x00008003
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define  NV50TCL_BLEND_ENABLE(x)							(0x00001360+((x)*4))
+#define  NV50TCL_BLEND_ENABLE__SIZE							0x00000008
+#define  NV50TCL_STENCIL_BACK_ENABLE							0x00001380
+#define  NV50TCL_STENCIL_BACK_OP_FAIL							0x00001384
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_BACK_OP_ZFAIL							0x00001388
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_BACK_OP_ZPASS							0x0000138c
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_BACK_FUNC_FUNC							0x00001390
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NV50TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NV50TCL_STENCIL_BACK_FUNC_REF							0x00001394
+#define  NV50TCL_STENCIL_BACK_MASK							0x00001398
+#define  NV50TCL_STENCIL_BACK_FUNC_MASK							0x0000139c
+#define  NV50TCL_LINE_WIDTH								0x000013b0
+#define  NV50TCL_VP_START_ID								0x0000140c
+#define  NV50TCL_GP_START_ID								0x00001410
+#define  NV50TCL_FP_START_ID								0x00001414
+#define  NV50TCL_POINT_SIZE								0x00001518
+#define  NV50TCL_TSC_ADDRESS_HIGH							0x0000155c
+#define  NV50TCL_TSC_ADDRESS_LOW							0x00001560
+#define  NV50TCL_POLYGON_OFFSET_FACTOR							0x0000156c
+#define  NV50TCL_LINE_SMOOTH_ENABLE							0x00001570
+#define  NV50TCL_TIC_ADDRESS_HIGH							0x00001574
+#define  NV50TCL_TIC_ADDRESS_LOW							0x00001578
+#define  NV50TCL_STENCIL_FRONT_ENABLE							0x00001594
+#define  NV50TCL_STENCIL_FRONT_OP_FAIL							0x00001598
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_FRONT_OP_ZFAIL							0x0000159c
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_FRONT_OP_ZPASS							0x000015a0
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NV50TCL_STENCIL_FRONT_FUNC_FUNC						0x000015a4
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NV50TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NV50TCL_POLYGON_OFFSET_UNITS							0x000015bc
+#define  NV50TCL_VERTEX_BEGIN								0x000015dc
+#define   NV50TCL_VERTEX_BEGIN_POINTS							0x00000000
+#define   NV50TCL_VERTEX_BEGIN_LINES							0x00000001
+#define   NV50TCL_VERTEX_BEGIN_LINE_LOOP						0x00000002
+#define   NV50TCL_VERTEX_BEGIN_LINE_STRIP						0x00000003
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLES						0x00000004
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP						0x00000005
+#define   NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN						0x00000006
+#define   NV50TCL_VERTEX_BEGIN_QUADS							0x00000007
+#define   NV50TCL_VERTEX_BEGIN_QUAD_STRIP						0x00000008
+#define   NV50TCL_VERTEX_BEGIN_POLYGON							0x00000009
+#define  NV50TCL_VERTEX_END								0x000015e0
+#define  NV50TCL_VERTEX_DATA								0x00001640
+#define  NV50TCL_VP_ATTR_EN_0								0x00001650
+#define   NV50TCL_VP_ATTR_EN_0_7_SHIFT							28
+#define   NV50TCL_VP_ATTR_EN_0_7_MASK							0xf0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNNN							0x10000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYNN							0x20000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYNN							0x30000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNZN							0x40000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNZN							0x50000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYZN							0x60000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYZN							0x70000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNNW							0x80000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNNW							0x90000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYNW							0xa0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYNW							0xb0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NNZW							0xc0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XNZW							0xd0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_NYZW							0xe0000000
+#define    NV50TCL_VP_ATTR_EN_0_7_XYZW							0xf0000000
+#define   NV50TCL_VP_ATTR_EN_0_6_SHIFT							24
+#define   NV50TCL_VP_ATTR_EN_0_6_MASK							0x0f000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNNN							0x01000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYNN							0x02000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYNN							0x03000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNZN							0x04000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNZN							0x05000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYZN							0x06000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYZN							0x07000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNNW							0x08000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNNW							0x09000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYNW							0x0a000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYNW							0x0b000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NNZW							0x0c000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XNZW							0x0d000000
+#define    NV50TCL_VP_ATTR_EN_0_6_NYZW							0x0e000000
+#define    NV50TCL_VP_ATTR_EN_0_6_XYZW							0x0f000000
+#define   NV50TCL_VP_ATTR_EN_0_5_SHIFT							20
+#define   NV50TCL_VP_ATTR_EN_0_5_MASK							0x00f00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNNN							0x00100000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYNN							0x00200000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYNN							0x00300000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNZN							0x00400000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNZN							0x00500000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYZN							0x00600000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYZN							0x00700000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNNW							0x00800000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNNW							0x00900000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYNW							0x00a00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYNW							0x00b00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NNZW							0x00c00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XNZW							0x00d00000
+#define    NV50TCL_VP_ATTR_EN_0_5_NYZW							0x00e00000
+#define    NV50TCL_VP_ATTR_EN_0_5_XYZW							0x00f00000
+#define   NV50TCL_VP_ATTR_EN_0_4_SHIFT							16
+#define   NV50TCL_VP_ATTR_EN_0_4_MASK							0x000f0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNNN							0x00010000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYNN							0x00020000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYNN							0x00030000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNZN							0x00040000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNZN							0x00050000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYZN							0x00060000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYZN							0x00070000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNNW							0x00080000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNNW							0x00090000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYNW							0x000a0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYNW							0x000b0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NNZW							0x000c0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XNZW							0x000d0000
+#define    NV50TCL_VP_ATTR_EN_0_4_NYZW							0x000e0000
+#define    NV50TCL_VP_ATTR_EN_0_4_XYZW							0x000f0000
+#define   NV50TCL_VP_ATTR_EN_0_3_SHIFT							12
+#define   NV50TCL_VP_ATTR_EN_0_3_MASK							0x0000f000
+#define    NV50TCL_VP_ATTR_EN_0_3_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNNN							0x00001000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYNN							0x00002000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYNN							0x00003000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNZN							0x00004000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNZN							0x00005000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYZN							0x00006000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYZN							0x00007000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNNW							0x00008000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNNW							0x00009000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYNW							0x0000a000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYNW							0x0000b000
+#define    NV50TCL_VP_ATTR_EN_0_3_NNZW							0x0000c000
+#define    NV50TCL_VP_ATTR_EN_0_3_XNZW							0x0000d000
+#define    NV50TCL_VP_ATTR_EN_0_3_NYZW							0x0000e000
+#define    NV50TCL_VP_ATTR_EN_0_3_XYZW							0x0000f000
+#define   NV50TCL_VP_ATTR_EN_0_2_SHIFT							8
+#define   NV50TCL_VP_ATTR_EN_0_2_MASK							0x00000f00
+#define    NV50TCL_VP_ATTR_EN_0_2_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_2_XNNN							0x00000100
+#define    NV50TCL_VP_ATTR_EN_0_2_NYNN							0x00000200
+#define    NV50TCL_VP_ATTR_EN_0_2_XYNN							0x00000300
+#define    NV50TCL_VP_ATTR_EN_0_2_NNZN							0x00000400
+#define    NV50TCL_VP_ATTR_EN_0_2_XNZN							0x00000500
+#define    NV50TCL_VP_ATTR_EN_0_2_NYZN							0x00000600
+#define    NV50TCL_VP_ATTR_EN_0_2_XYZN							0x00000700
+#define    NV50TCL_VP_ATTR_EN_0_2_NNNW							0x00000800
+#define    NV50TCL_VP_ATTR_EN_0_2_XNNW							0x00000900
+#define    NV50TCL_VP_ATTR_EN_0_2_NYNW							0x00000a00
+#define    NV50TCL_VP_ATTR_EN_0_2_XYNW							0x00000b00
+#define    NV50TCL_VP_ATTR_EN_0_2_NNZW							0x00000c00
+#define    NV50TCL_VP_ATTR_EN_0_2_XNZW							0x00000d00
+#define    NV50TCL_VP_ATTR_EN_0_2_NYZW							0x00000e00
+#define    NV50TCL_VP_ATTR_EN_0_2_XYZW							0x00000f00
+#define   NV50TCL_VP_ATTR_EN_0_1_SHIFT							4
+#define   NV50TCL_VP_ATTR_EN_0_1_MASK							0x000000f0
+#define    NV50TCL_VP_ATTR_EN_0_1_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_1_XNNN							0x00000010
+#define    NV50TCL_VP_ATTR_EN_0_1_NYNN							0x00000020
+#define    NV50TCL_VP_ATTR_EN_0_1_XYNN							0x00000030
+#define    NV50TCL_VP_ATTR_EN_0_1_NNZN							0x00000040
+#define    NV50TCL_VP_ATTR_EN_0_1_XNZN							0x00000050
+#define    NV50TCL_VP_ATTR_EN_0_1_NYZN							0x00000060
+#define    NV50TCL_VP_ATTR_EN_0_1_XYZN							0x00000070
+#define    NV50TCL_VP_ATTR_EN_0_1_NNNW							0x00000080
+#define    NV50TCL_VP_ATTR_EN_0_1_XNNW							0x00000090
+#define    NV50TCL_VP_ATTR_EN_0_1_NYNW							0x000000a0
+#define    NV50TCL_VP_ATTR_EN_0_1_XYNW							0x000000b0
+#define    NV50TCL_VP_ATTR_EN_0_1_NNZW							0x000000c0
+#define    NV50TCL_VP_ATTR_EN_0_1_XNZW							0x000000d0
+#define    NV50TCL_VP_ATTR_EN_0_1_NYZW							0x000000e0
+#define    NV50TCL_VP_ATTR_EN_0_1_XYZW							0x000000f0
+#define   NV50TCL_VP_ATTR_EN_0_0_SHIFT							0
+#define   NV50TCL_VP_ATTR_EN_0_0_MASK							0x0000000f
+#define    NV50TCL_VP_ATTR_EN_0_0_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_0_0_XNNN							0x00000001
+#define    NV50TCL_VP_ATTR_EN_0_0_NYNN							0x00000002
+#define    NV50TCL_VP_ATTR_EN_0_0_XYNN							0x00000003
+#define    NV50TCL_VP_ATTR_EN_0_0_NNZN							0x00000004
+#define    NV50TCL_VP_ATTR_EN_0_0_XNZN							0x00000005
+#define    NV50TCL_VP_ATTR_EN_0_0_NYZN							0x00000006
+#define    NV50TCL_VP_ATTR_EN_0_0_XYZN							0x00000007
+#define    NV50TCL_VP_ATTR_EN_0_0_NNNW							0x00000008
+#define    NV50TCL_VP_ATTR_EN_0_0_XNNW							0x00000009
+#define    NV50TCL_VP_ATTR_EN_0_0_NYNW							0x0000000a
+#define    NV50TCL_VP_ATTR_EN_0_0_XYNW							0x0000000b
+#define    NV50TCL_VP_ATTR_EN_0_0_NNZW							0x0000000c
+#define    NV50TCL_VP_ATTR_EN_0_0_XNZW							0x0000000d
+#define    NV50TCL_VP_ATTR_EN_0_0_NYZW							0x0000000e
+#define    NV50TCL_VP_ATTR_EN_0_0_XYZW							0x0000000f
+#define  NV50TCL_VP_ATTR_EN_1								0x00001654
+#define   NV50TCL_VP_ATTR_EN_1_15_SHIFT							28
+#define   NV50TCL_VP_ATTR_EN_1_15_MASK							0xf0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNNN							0x10000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYNN							0x20000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYNN							0x30000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNZN							0x40000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNZN							0x50000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYZN							0x60000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYZN							0x70000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNNW							0x80000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNNW							0x90000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYNW							0xa0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYNW							0xb0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NNZW							0xc0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XNZW							0xd0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_NYZW							0xe0000000
+#define    NV50TCL_VP_ATTR_EN_1_15_XYZW							0xf0000000
+#define   NV50TCL_VP_ATTR_EN_1_14_SHIFT							24
+#define   NV50TCL_VP_ATTR_EN_1_14_MASK							0x0f000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNNN							0x01000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYNN							0x02000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYNN							0x03000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNZN							0x04000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNZN							0x05000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYZN							0x06000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYZN							0x07000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNNW							0x08000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNNW							0x09000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYNW							0x0a000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYNW							0x0b000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NNZW							0x0c000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XNZW							0x0d000000
+#define    NV50TCL_VP_ATTR_EN_1_14_NYZW							0x0e000000
+#define    NV50TCL_VP_ATTR_EN_1_14_XYZW							0x0f000000
+#define   NV50TCL_VP_ATTR_EN_1_13_SHIFT							20
+#define   NV50TCL_VP_ATTR_EN_1_13_MASK							0x00f00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNNN							0x00100000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYNN							0x00200000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYNN							0x00300000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNZN							0x00400000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNZN							0x00500000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYZN							0x00600000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYZN							0x00700000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNNW							0x00800000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNNW							0x00900000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYNW							0x00a00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYNW							0x00b00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NNZW							0x00c00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XNZW							0x00d00000
+#define    NV50TCL_VP_ATTR_EN_1_13_NYZW							0x00e00000
+#define    NV50TCL_VP_ATTR_EN_1_13_XYZW							0x00f00000
+#define   NV50TCL_VP_ATTR_EN_1_12_SHIFT							16
+#define   NV50TCL_VP_ATTR_EN_1_12_MASK							0x000f0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNNN							0x00010000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYNN							0x00020000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYNN							0x00030000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNZN							0x00040000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNZN							0x00050000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYZN							0x00060000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYZN							0x00070000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNNW							0x00080000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNNW							0x00090000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYNW							0x000a0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYNW							0x000b0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NNZW							0x000c0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XNZW							0x000d0000
+#define    NV50TCL_VP_ATTR_EN_1_12_NYZW							0x000e0000
+#define    NV50TCL_VP_ATTR_EN_1_12_XYZW							0x000f0000
+#define   NV50TCL_VP_ATTR_EN_1_11_SHIFT							12
+#define   NV50TCL_VP_ATTR_EN_1_11_MASK							0x0000f000
+#define    NV50TCL_VP_ATTR_EN_1_11_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNNN							0x00001000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYNN							0x00002000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYNN							0x00003000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNZN							0x00004000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNZN							0x00005000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYZN							0x00006000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYZN							0x00007000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNNW							0x00008000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNNW							0x00009000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYNW							0x0000a000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYNW							0x0000b000
+#define    NV50TCL_VP_ATTR_EN_1_11_NNZW							0x0000c000
+#define    NV50TCL_VP_ATTR_EN_1_11_XNZW							0x0000d000
+#define    NV50TCL_VP_ATTR_EN_1_11_NYZW							0x0000e000
+#define    NV50TCL_VP_ATTR_EN_1_11_XYZW							0x0000f000
+#define   NV50TCL_VP_ATTR_EN_1_10_SHIFT							8
+#define   NV50TCL_VP_ATTR_EN_1_10_MASK							0x00000f00
+#define    NV50TCL_VP_ATTR_EN_1_10_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_10_XNNN							0x00000100
+#define    NV50TCL_VP_ATTR_EN_1_10_NYNN							0x00000200
+#define    NV50TCL_VP_ATTR_EN_1_10_XYNN							0x00000300
+#define    NV50TCL_VP_ATTR_EN_1_10_NNZN							0x00000400
+#define    NV50TCL_VP_ATTR_EN_1_10_XNZN							0x00000500
+#define    NV50TCL_VP_ATTR_EN_1_10_NYZN							0x00000600
+#define    NV50TCL_VP_ATTR_EN_1_10_XYZN							0x00000700
+#define    NV50TCL_VP_ATTR_EN_1_10_NNNW							0x00000800
+#define    NV50TCL_VP_ATTR_EN_1_10_XNNW							0x00000900
+#define    NV50TCL_VP_ATTR_EN_1_10_NYNW							0x00000a00
+#define    NV50TCL_VP_ATTR_EN_1_10_XYNW							0x00000b00
+#define    NV50TCL_VP_ATTR_EN_1_10_NNZW							0x00000c00
+#define    NV50TCL_VP_ATTR_EN_1_10_XNZW							0x00000d00
+#define    NV50TCL_VP_ATTR_EN_1_10_NYZW							0x00000e00
+#define    NV50TCL_VP_ATTR_EN_1_10_XYZW							0x00000f00
+#define   NV50TCL_VP_ATTR_EN_1_9_SHIFT							4
+#define   NV50TCL_VP_ATTR_EN_1_9_MASK							0x000000f0
+#define    NV50TCL_VP_ATTR_EN_1_9_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_9_XNNN							0x00000010
+#define    NV50TCL_VP_ATTR_EN_1_9_NYNN							0x00000020
+#define    NV50TCL_VP_ATTR_EN_1_9_XYNN							0x00000030
+#define    NV50TCL_VP_ATTR_EN_1_9_NNZN							0x00000040
+#define    NV50TCL_VP_ATTR_EN_1_9_XNZN							0x00000050
+#define    NV50TCL_VP_ATTR_EN_1_9_NYZN							0x00000060
+#define    NV50TCL_VP_ATTR_EN_1_9_XYZN							0x00000070
+#define    NV50TCL_VP_ATTR_EN_1_9_NNNW							0x00000080
+#define    NV50TCL_VP_ATTR_EN_1_9_XNNW							0x00000090
+#define    NV50TCL_VP_ATTR_EN_1_9_NYNW							0x000000a0
+#define    NV50TCL_VP_ATTR_EN_1_9_XYNW							0x000000b0
+#define    NV50TCL_VP_ATTR_EN_1_9_NNZW							0x000000c0
+#define    NV50TCL_VP_ATTR_EN_1_9_XNZW							0x000000d0
+#define    NV50TCL_VP_ATTR_EN_1_9_NYZW							0x000000e0
+#define    NV50TCL_VP_ATTR_EN_1_9_XYZW							0x000000f0
+#define   NV50TCL_VP_ATTR_EN_1_8_SHIFT							0
+#define   NV50TCL_VP_ATTR_EN_1_8_MASK							0x0000000f
+#define    NV50TCL_VP_ATTR_EN_1_8_NONE							0x00000000
+#define    NV50TCL_VP_ATTR_EN_1_8_XNNN							0x00000001
+#define    NV50TCL_VP_ATTR_EN_1_8_NYNN							0x00000002
+#define    NV50TCL_VP_ATTR_EN_1_8_XYNN							0x00000003
+#define    NV50TCL_VP_ATTR_EN_1_8_NNZN							0x00000004
+#define    NV50TCL_VP_ATTR_EN_1_8_XNZN							0x00000005
+#define    NV50TCL_VP_ATTR_EN_1_8_NYZN							0x00000006
+#define    NV50TCL_VP_ATTR_EN_1_8_XYZN							0x00000007
+#define    NV50TCL_VP_ATTR_EN_1_8_NNNW							0x00000008
+#define    NV50TCL_VP_ATTR_EN_1_8_XNNW							0x00000009
+#define    NV50TCL_VP_ATTR_EN_1_8_NYNW							0x0000000a
+#define    NV50TCL_VP_ATTR_EN_1_8_XYNW							0x0000000b
+#define    NV50TCL_VP_ATTR_EN_1_8_NNZW							0x0000000c
+#define    NV50TCL_VP_ATTR_EN_1_8_XNZW							0x0000000d
+#define    NV50TCL_VP_ATTR_EN_1_8_NYZW							0x0000000e
+#define    NV50TCL_VP_ATTR_EN_1_8_XYZW							0x0000000f
+#define  NV50TCL_LINE_STIPPLE_ENABLE							0x0000166c
+#define  NV50TCL_LINE_STIPPLE_PATTERN							0x00001680
+#define  NV50TCL_POLYGON_STIPPLE_ENABLE							0x0000168c
+#define  NV50TCL_VP_REG_HPOS								0x000016bc
+#define   NV50TCL_VP_REG_HPOS_X_SHIFT							0
+#define   NV50TCL_VP_REG_HPOS_X_MASK							0x000000ff
+#define   NV50TCL_VP_REG_HPOS_Y_SHIFT							8
+#define   NV50TCL_VP_REG_HPOS_Y_MASK							0x0000ff00
+#define   NV50TCL_VP_REG_HPOS_Z_SHIFT							16
+#define   NV50TCL_VP_REG_HPOS_Z_MASK							0x00ff0000
+#define   NV50TCL_VP_REG_HPOS_W_SHIFT							24
+#define   NV50TCL_VP_REG_HPOS_W_MASK							0xff000000
+#define  NV50TCL_VP_REG_COL0								0x000016c0
+#define   NV50TCL_VP_REG_COL0_X_SHIFT							0
+#define   NV50TCL_VP_REG_COL0_X_MASK							0x000000ff
+#define   NV50TCL_VP_REG_COL0_Y_SHIFT							8
+#define   NV50TCL_VP_REG_COL0_Y_MASK							0x0000ff00
+#define   NV50TCL_VP_REG_COL0_Z_SHIFT							16
+#define   NV50TCL_VP_REG_COL0_Z_MASK							0x00ff0000
+#define   NV50TCL_VP_REG_COL0_W_SHIFT							24
+#define   NV50TCL_VP_REG_COL0_W_MASK							0xff000000
+#define  NV50TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001700+((x)*4))
+#define  NV50TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NV50TCL_CULL_FACE_ENABLE							0x00001918
+#define  NV50TCL_FRONT_FACE								0x0000191c
+#define   NV50TCL_FRONT_FACE_CW								0x00000900
+#define   NV50TCL_FRONT_FACE_CCW							0x00000901
+#define  NV50TCL_CULL_FACE								0x00001920
+#define   NV50TCL_CULL_FACE_FRONT							0x00000404
+#define   NV50TCL_CULL_FACE_BACK							0x00000405
+#define   NV50TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NV50TCL_LOGIC_OP_ENABLE							0x000019c4
+#define  NV50TCL_LOGIC_OP								0x000019c8
+#define   NV50TCL_LOGIC_OP_CLEAR							0x00001500
+#define   NV50TCL_LOGIC_OP_AND								0x00001501
+#define   NV50TCL_LOGIC_OP_AND_REVERSE							0x00001502
+#define   NV50TCL_LOGIC_OP_COPY								0x00001503
+#define   NV50TCL_LOGIC_OP_AND_INVERTED							0x00001504
+#define   NV50TCL_LOGIC_OP_NOOP								0x00001505
+#define   NV50TCL_LOGIC_OP_XOR								0x00001506
+#define   NV50TCL_LOGIC_OP_OR								0x00001507
+#define   NV50TCL_LOGIC_OP_NOR								0x00001508
+#define   NV50TCL_LOGIC_OP_EQUIV							0x00001509
+#define   NV50TCL_LOGIC_OP_INVERT							0x0000150a
+#define   NV50TCL_LOGIC_OP_OR_REVERSE							0x0000150b
+#define   NV50TCL_LOGIC_OP_COPY_INVERTED						0x0000150c
+#define   NV50TCL_LOGIC_OP_OR_INVERTED							0x0000150d
+#define   NV50TCL_LOGIC_OP_NAND								0x0000150e
+#define   NV50TCL_LOGIC_OP_SET								0x0000150f
+#define  NV50TCL_CLEAR_BUFFERS								0x000019d0
+#define  NV50TCL_COLOR_MASK(x)								(0x00001a00+((x)*4))
+#define  NV50TCL_COLOR_MASK__SIZE							0x00000008
+#define   NV50TCL_COLOR_MASK_R_SHIFT							0
+#define   NV50TCL_COLOR_MASK_R_MASK							0x0000000f
+#define   NV50TCL_COLOR_MASK_G_SHIFT							4
+#define   NV50TCL_COLOR_MASK_G_MASK							0x000000f0
+#define   NV50TCL_COLOR_MASK_B_SHIFT							8
+#define   NV50TCL_COLOR_MASK_B_MASK							0x00000f00
+#define   NV50TCL_COLOR_MASK_A_SHIFT							12
+#define   NV50TCL_COLOR_MASK_A_MASK							0x0000f000
+
+
+#define NV50_COMPUTE									0x000050c0
+
+#define  NV50_COMPUTE_DMA_UNK0								0x000001a0
+#define  NV50_COMPUTE_DMA_STATUS							0x000001a4
+#define  NV50_COMPUTE_DMA_UNK1								0x000001b8
+#define  NV50_COMPUTE_DMA_UNK2								0x000001bc
+#define  NV50_COMPUTE_DMA_UNK3								0x000001c0
+#define  NV50_COMPUTE_UNK4_HIGH								0x00000210
+#define  NV50_COMPUTE_UNK4_LOW								0x00000214
+#define  NV50_COMPUTE_UNK5_HIGH								0x00000218
+#define  NV50_COMPUTE_UNK5_LOW								0x0000021c
+#define  NV50_COMPUTE_UNK6_HIGH								0x00000294
+#define  NV50_COMPUTE_UNK6_LOW								0x00000298
+#define  NV50_COMPUTE_CONST_BASE_HIGH							0x000002a4
+#define  NV50_COMPUTE_CONST_BASE_LO							0x000002a8
+#define  NV50_COMPUTE_CONST_SIZE_SEG							0x000002ac
+#define  NV50_COMPUTE_REG_COUNT								0x000002c0
+#define  NV50_COMPUTE_STATUS_HIGH							0x00000310
+#define  NV50_COMPUTE_STATUS_LOW							0x00000314
+#define  NV50_COMPUTE_EXECUTE								0x0000031c
+#define  NV50_COMPUTE_USER_PARAM_COUNT							0x00000374
+#define  NV50_COMPUTE_GRIDDIM_YX							0x000003a4
+#define  NV50_COMPUTE_SHARED_SIZE							0x000003a8
+#define  NV50_COMPUTE_BLOCKDIM_YX							0x000003ac
+#define  NV50_COMPUTE_BLOCKDIM_Z							0x000003b0
+#define  NV50_COMPUTE_CALL_ADDRESS							0x000003b4
+#define  NV50_COMPUTE_GLOBAL_BASE_HIGH(x)						(0x00000400+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_BASE_HIGH__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_BASE_LOW(x)						(0x00000404+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_BASE_LOW__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_LIMIT_HIGH(x)						(0x00000408+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_LIMIT_HIGH__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_LIMIT_LOW(x)						(0x0000040c+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_LIMIT_LOW__SIZE						0x00000010
+#define  NV50_COMPUTE_GLOBAL_UNK(x)							(0x00000410+((x)*32))
+#define  NV50_COMPUTE_GLOBAL_UNK__SIZE							0x00000010
+#define  NV50_COMPUTE_USER_PARAM(x)							(0x00000600+((x)*4))
+#define  NV50_COMPUTE_USER_PARAM__SIZE							0x00000040
+
+
+#define NV54TCL										0x00008297
+
+
+
+#endif /* NOUVEAU_REG_H */
diff --git a/src/gallium/drivers/nouveau/nouveau_device.h b/src/gallium/drivers/nouveau/nouveau_device.h
new file mode 100644
index 00000000000..e25e89fedda
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_device.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_DEVICE_H__
+#define __NOUVEAU_DEVICE_H__
+
+struct nouveau_device {
+	unsigned chipset;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h
new file mode 100644
index 00000000000..e1015c93a27
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h
@@ -0,0 +1,196 @@
+#ifndef __NOUVEAU_GLDEFS_H__
+#define __NOUVEAU_GLDEFS_H__
+
+static INLINE unsigned
+nvgl_blend_func(unsigned factor)
+{
+	switch (factor) {
+	case PIPE_BLENDFACTOR_ZERO:
+		return 0x0000;
+	case PIPE_BLENDFACTOR_ONE:
+		return 0x0001;
+	case PIPE_BLENDFACTOR_SRC_COLOR:
+		return 0x0300;
+	case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+		return 0x0301;
+	case PIPE_BLENDFACTOR_SRC_ALPHA:
+		return 0x0302;
+	case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+		return 0x0303;
+	case PIPE_BLENDFACTOR_DST_ALPHA:
+		return 0x0304;
+	case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+		return 0x0305;
+	case PIPE_BLENDFACTOR_DST_COLOR:
+		return 0x0306;
+	case PIPE_BLENDFACTOR_INV_DST_COLOR:
+		return 0x0307;
+	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+		return 0x0308;
+	case PIPE_BLENDFACTOR_CONST_COLOR:
+		return 0x8001;
+	case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+		return 0x8002;
+	case PIPE_BLENDFACTOR_CONST_ALPHA:
+		return 0x8003;
+	case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+		return 0x8004;
+	default:
+		return 0x0000;
+	}
+}
+
+static INLINE unsigned
+nvgl_blend_eqn(unsigned func)
+{
+	switch (func) {
+	case PIPE_BLEND_ADD:
+		return 0x8006;
+	case PIPE_BLEND_MIN:
+		return 0x8007;
+	case PIPE_BLEND_MAX:
+		return 0x8008;
+	case PIPE_BLEND_SUBTRACT:
+		return 0x800a;
+	case PIPE_BLEND_REVERSE_SUBTRACT:
+		return 0x800b;
+	default:
+		return 0x8006;
+	}
+}
+
+static INLINE unsigned
+nvgl_logicop_func(unsigned func)
+{
+	switch (func) {
+	case PIPE_LOGICOP_CLEAR:
+		return 0x1500;
+	case PIPE_LOGICOP_NOR:
+		return 0x1508;
+	case PIPE_LOGICOP_AND_INVERTED:
+		return 0x1504;
+	case PIPE_LOGICOP_COPY_INVERTED:
+		return 0x150c;
+	case PIPE_LOGICOP_AND_REVERSE:
+		return 0x1502;
+	case PIPE_LOGICOP_INVERT:
+		return 0x150a;
+	case PIPE_LOGICOP_XOR:
+		return 0x1506;
+	case PIPE_LOGICOP_NAND:
+		return 0x150e;
+	case PIPE_LOGICOP_AND:
+		return 0x1501;
+	case PIPE_LOGICOP_EQUIV:
+		return 0x1509;
+	case PIPE_LOGICOP_NOOP:
+		return 0x1505;
+	case PIPE_LOGICOP_OR_INVERTED:
+		return 0x150d;
+	case PIPE_LOGICOP_COPY:
+		return 0x1503;
+	case PIPE_LOGICOP_OR_REVERSE:
+		return 0x150b;
+	case PIPE_LOGICOP_OR:
+		return 0x1507;
+	case PIPE_LOGICOP_SET:
+		return 0x150f;
+	default:
+		return 0x1505;
+	}
+}
+
+static INLINE unsigned
+nvgl_comparison_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_FUNC_NEVER:
+		return 0x0200;
+	case PIPE_FUNC_LESS:
+		return 0x0201;
+	case PIPE_FUNC_EQUAL:
+		return 0x0202;
+	case PIPE_FUNC_LEQUAL:
+		return 0x0203;
+	case PIPE_FUNC_GREATER:
+		return 0x0204;
+	case PIPE_FUNC_NOTEQUAL:
+		return 0x0205;
+	case PIPE_FUNC_GEQUAL:
+		return 0x0206;
+	case PIPE_FUNC_ALWAYS:
+		return 0x0207;
+	default:
+		return 0x0207;
+	}
+}
+
+static INLINE unsigned
+nvgl_polygon_mode(unsigned mode)
+{
+	switch (mode) {
+	case PIPE_POLYGON_MODE_POINT:
+		return 0x1b00;
+	case PIPE_POLYGON_MODE_LINE:
+		return 0x1b01;
+	case PIPE_POLYGON_MODE_FILL:
+		return 0x1b02;
+	default:
+		return 0x1b02;
+	}
+}
+
+static INLINE unsigned
+nvgl_stencil_op(unsigned op)
+{
+	switch (op) {
+	case PIPE_STENCIL_OP_ZERO:
+		return 0x0000;
+	case PIPE_STENCIL_OP_INVERT:
+		return 0x150a;
+	case PIPE_STENCIL_OP_KEEP:
+		return 0x1e00;
+	case PIPE_STENCIL_OP_REPLACE:
+		return 0x1e01;
+	case PIPE_STENCIL_OP_INCR:
+		return 0x1e02;
+	case PIPE_STENCIL_OP_DECR:
+		return 0x1e03;
+	case PIPE_STENCIL_OP_INCR_WRAP:
+		return 0x8507;
+	case PIPE_STENCIL_OP_DECR_WRAP:
+		return 0x8508;
+	default:
+		return 0x1e00;
+	}
+}
+
+static INLINE unsigned
+nvgl_primitive(unsigned prim) {
+	switch (prim) {
+	case PIPE_PRIM_POINTS:
+		return 0x0001;
+	case PIPE_PRIM_LINES:
+		return 0x0002;
+	case PIPE_PRIM_LINE_LOOP:
+		return 0x0003;
+	case PIPE_PRIM_LINE_STRIP:
+		return 0x0004;
+	case PIPE_PRIM_TRIANGLES:
+		return 0x0005;
+	case PIPE_PRIM_TRIANGLE_STRIP:
+		return 0x0006;
+	case PIPE_PRIM_TRIANGLE_FAN:
+		return 0x0007;
+	case PIPE_PRIM_QUADS:
+		return 0x0008;
+	case PIPE_PRIM_QUAD_STRIP:
+		return 0x0009;
+	case PIPE_PRIM_POLYGON:
+		return 0x000a;
+	default:
+		return 0x0001;
+	}
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_grobj.h b/src/gallium/drivers/nouveau/nouveau_grobj.h
new file mode 100644
index 00000000000..8f5abf90514
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_grobj.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_GROBJ_H__
+#define __NOUVEAU_GROBJ_H__
+
+#include "nouveau_channel.h"
+
+struct nouveau_grobj {
+	struct nouveau_channel *channel;
+	int grclass;
+	uint32_t handle;
+	int subc;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_notifier.h b/src/gallium/drivers/nouveau/nouveau_notifier.h
new file mode 100644
index 00000000000..35adde1e324
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_notifier.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_NOTIFIER_H__
+#define __NOUVEAU_NOTIFIER_H__
+
+#define NV_NOTIFIER_SIZE                                                      32
+#define NV_NOTIFY_TIME_0                                              0x00000000
+#define NV_NOTIFY_TIME_1                                              0x00000004
+#define NV_NOTIFY_RETURN_VALUE                                        0x00000008
+#define NV_NOTIFY_STATE                                               0x0000000C
+#define NV_NOTIFY_STATE_STATUS_MASK                                   0xFF000000
+#define NV_NOTIFY_STATE_STATUS_SHIFT                                          24
+#define NV_NOTIFY_STATE_STATUS_COMPLETED                                    0x00
+#define NV_NOTIFY_STATE_STATUS_IN_PROCESS                                   0x01
+#define NV_NOTIFY_STATE_ERROR_CODE_MASK                               0x0000FFFF
+#define NV_NOTIFY_STATE_ERROR_CODE_SHIFT                                       0
+
+struct nouveau_notifier {
+	struct nouveau_channel *channel;
+	uint32_t handle;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h
new file mode 100644
index 00000000000..54ef1c1291e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_push.h
@@ -0,0 +1,82 @@
+#ifndef __NOUVEAU_PUSH_H__
+#define __NOUVEAU_PUSH_H__
+
+#include "nouveau/nouveau_winsys.h"
+
+#ifndef NOUVEAU_PUSH_CONTEXT
+#error undefined push context
+#endif
+
+#define OUT_RING(data) do {                                                    \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	(*pc->nvws->channel->pushbuf->cur++) = (data);                         \
+} while(0)
+
+#define OUT_RINGp(src,size) do {                                               \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	memcpy(pc->nvws->channel->pushbuf->cur, (src), (size) * 4);            \
+	pc->nvws->channel->pushbuf->cur += (size);                             \
+} while(0)
+
+#define OUT_RINGf(data) do {                                                   \
+	union { float v; uint32_t u; } c;                                      \
+	c.v = (data);                                                          \
+	OUT_RING(c.u);                                                         \
+} while(0)
+
+#define BEGIN_RING(obj,mthd,size) do {                                         \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
+		pc->nvws->push_flush(pc->nvws, ((size) + 1), NULL);            \
+	OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd));             \
+	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+} while(0)
+
+#define BEGIN_RING_NI(obj,mthd,size) do {                                      \
+	BEGIN_RING(obj, (mthd) | 0x40000000, (size));                          \
+} while(0)
+
+#define FIRE_RING(fence) do {                                                  \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	pc->nvws->push_flush(pc->nvws, 0, fence);                              \
+} while(0)
+
+#define OUT_RELOC(bo,data,flags,vor,tor) do {                                  \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	pc->nvws->push_reloc(pc->nvws, pc->nvws->channel->pushbuf->cur++,      \
+			     (bo), (data), (flags), (vor), (tor));             \
+} while(0)
+
+/* Raw data + flags depending on FB/TT buffer */
+#define OUT_RELOCd(bo,data,flags,vor,tor) do {                                 \
+	OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor));        \
+} while(0)
+
+/* FB/TT object handle */
+#define OUT_RELOCo(bo,flags) do {                                              \
+	OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR,                            \
+		  pc->nvws->channel->vram->handle,                             \
+		  pc->nvws->channel->gart->handle);                            \
+} while(0)
+
+/* Low 32-bits of offset */
+#define OUT_RELOCl(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0);              \
+} while(0)
+
+/* High 32-bits of offset */
+#define OUT_RELOCh(bo,delta,flags) do {                                        \
+	OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0);             \
+} while(0)
+
+/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
+#define OUT_RELOCm(bo, flags, obj, mthd, size) do {                            \
+	NOUVEAU_PUSH_CONTEXT(pc);                                              \
+	if (pc->nvws->channel->pushbuf->remaining < ((size) + 1))              \
+		pc->nvws->push_flush(pc->nvws->channel, ((size) + 1), NULL);   \
+	OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd),      \
+		   (flags), 0, 0);                                             \
+	pc->nvws->channel->pushbuf->remaining -= ((size) + 1);                 \
+} while(0)
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_pushbuf.h b/src/gallium/drivers/nouveau/nouveau_pushbuf.h
new file mode 100644
index 00000000000..19097650982
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_pushbuf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_PUSHBUF_H__
+#define __NOUVEAU_PUSHBUF_H__
+
+struct nouveau_pushbuf {
+	struct nouveau_channel *channel;
+	unsigned remaining;
+	uint32_t *cur;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_resource.h b/src/gallium/drivers/nouveau/nouveau_resource.h
new file mode 100644
index 00000000000..1af7961d301
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_resource.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_RESOURCE_H__
+#define __NOUVEAU_RESOURCE_H__
+
+struct nouveau_resource {
+	struct nouveau_resource *prev;
+	struct nouveau_resource *next;
+
+	int in_use;
+	void *priv;
+
+	unsigned int start;
+	unsigned int size;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
new file mode 100644
index 00000000000..729988b095e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -0,0 +1,158 @@
+#ifndef __NOUVEAU_STATEOBJ_H__
+#define __NOUVEAU_STATEOBJ_H__
+
+#include "pipe/p_debug.h"
+
+struct nouveau_stateobj_reloc {
+	struct pipe_buffer *bo;
+
+	unsigned offset;
+	unsigned packet;
+
+	unsigned data;
+	unsigned flags;
+	unsigned vor;
+	unsigned tor;
+};
+
+struct nouveau_stateobj {
+	int refcount;
+
+	unsigned *push;
+	struct nouveau_stateobj_reloc *reloc;
+
+	unsigned *cur;
+	unsigned cur_packet;
+	unsigned cur_reloc;
+};
+
+static INLINE struct nouveau_stateobj *
+so_new(unsigned push, unsigned reloc)
+{
+	struct nouveau_stateobj *so;
+
+	so = MALLOC(sizeof(struct nouveau_stateobj));
+	so->refcount = 0;
+	so->push = MALLOC(sizeof(unsigned) * push);
+	so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc);
+
+	so->cur = so->push;
+	so->cur_reloc = so->cur_packet = 0;
+
+	return so;
+}
+
+static INLINE void
+so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
+{
+	struct nouveau_stateobj *so = *pso;
+
+	if (ref) {
+		ref->refcount++;
+	}
+
+	if (so && --so->refcount <= 0) {
+		free(so->push);
+		free(so->reloc);
+		free(so);
+	}
+
+	*pso = ref;
+}
+
+static INLINE void
+so_data(struct nouveau_stateobj *so, unsigned data)
+{
+	(*so->cur++) = (data);
+	so->cur_packet += 4;
+}
+
+static INLINE void
+so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size)
+{
+	so->cur_packet += (4 * size);
+	while (size--)
+		(*so->cur++) = (*data++);
+}
+
+static INLINE void
+so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
+	  unsigned mthd, unsigned size)
+{
+	so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4);
+	so_data(so, (gr->subc << 13) | (size << 18) | mthd);
+}
+
+static INLINE void
+so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo,
+	 unsigned data, unsigned flags, unsigned vor, unsigned tor)
+{
+	struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
+	
+	r->bo = bo;
+	r->offset = so->cur - so->push;
+	r->packet = so->cur_packet;
+	r->data = data;
+	r->flags = flags;
+	r->vor = vor;
+	r->tor = tor;
+	so_data(so, data);
+}
+
+static INLINE void
+so_dump(struct nouveau_stateobj *so)
+{
+	unsigned i, nr = so->cur - so->push;
+
+	for (i = 0; i < nr; i++)
+		debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]);
+}
+
+static INLINE void
+so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+{
+	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
+	unsigned nr, i;
+
+	nr = so->cur - so->push;
+	if (pb->remaining < nr)
+		nvws->push_flush(nvws, nr, NULL);
+	pb->remaining -= nr;
+
+	memcpy(pb->cur, so->push, nr * 4);
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+		nvws->push_reloc(nvws, pb->cur + r->offset, r->bo,
+				 r->data, r->flags, r->vor, r->tor);
+	}
+	pb->cur += nr;
+}
+
+static INLINE void
+so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so)
+{
+	struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
+	unsigned i;
+
+	if (!so)
+		return;
+
+	i = so->cur_reloc << 1;
+	if (nvws->channel->pushbuf->remaining < i)
+		nvws->push_flush(nvws, i, NULL);
+	nvws->channel->pushbuf->remaining -= i;
+
+	for (i = 0; i < so->cur_reloc; i++) {
+		struct nouveau_stateobj_reloc *r = &so->reloc[i];
+
+		nvws->push_reloc(nvws, pb->cur++, r->bo, r->packet,
+				 (r->flags &
+				  (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART)) |
+				 NOUVEAU_BO_DUMMY, 0, 0);
+		nvws->push_reloc(nvws, pb->cur++, r->bo, r->data,
+				 r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor);
+	}
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h
new file mode 100644
index 00000000000..c92041ebeba
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_util.h
@@ -0,0 +1,64 @@
+#ifndef __NOUVEAU_UTIL_H__
+#define __NOUVEAU_UTIL_H__
+
+/* Determine how many vertices can be pushed into the command stream.
+ * Where the remaining space isn't large enough to represent all verices,
+ * split the buffer at primitive boundaries.
+ *
+ * Returns a count of vertices that can be rendered, and an index to
+ * restart drawing at after a flush.
+ */
+static INLINE unsigned
+nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp,
+		   unsigned mode, unsigned start, unsigned count,
+		   unsigned *restart)
+{
+	int max, adj = 0;
+
+	max  = remaining - overhead;
+	if (max < 0)
+		return 0;
+
+	max *= vpp;
+	if (max >= count)
+		return count;
+
+	switch (mode) {
+	case PIPE_PRIM_POINTS:
+		break;
+	case PIPE_PRIM_LINES:
+		max = max & 1;
+		break;
+	case PIPE_PRIM_TRIANGLES:
+		max = max - (max % 3);
+		break;
+	case PIPE_PRIM_QUADS:
+		max = max & 3;
+		break;
+	case PIPE_PRIM_LINE_LOOP:
+	case PIPE_PRIM_LINE_STRIP:
+		if (max < 2)
+			max = 0;
+		adj = 1;
+		break;
+	case PIPE_PRIM_POLYGON:
+	case PIPE_PRIM_TRIANGLE_STRIP:
+	case PIPE_PRIM_TRIANGLE_FAN:
+		if (max < 3)
+			max = 0;
+		adj = 2;
+		break;
+	case PIPE_PRIM_QUAD_STRIP:
+		if (max < 4)
+			max = 0;
+		adj = 3;
+		break;
+	default:
+		assert(0);
+	}
+
+	*restart = start + max - adj;
+	return max;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
new file mode 100644
index 00000000000..a89b056244d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -0,0 +1,96 @@
+#ifndef NOUVEAU_WINSYS_H
+#define NOUVEAU_WINSYS_H
+
+#include <stdint.h>
+#include "pipe/p_winsys.h"
+#include "pipe/p_defines.h"
+
+#include "nouveau/nouveau_bo.h"
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_class.h"
+#include "nouveau/nouveau_device.h"
+#include "nouveau/nouveau_grobj.h"
+#include "nouveau/nouveau_notifier.h"
+#include "nouveau/nouveau_resource.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+#define NOUVEAU_CAP_HW_VTXBUF (0xbeef0000)
+#define NOUVEAU_CAP_HW_IDXBUF (0xbeef0001)
+
+#define NOUVEAU_BUFFER_USAGE_TEXTURE (1 << 16)
+#define NOUVEAU_BUFFER_USAGE_ZETA    (1 << 17)
+
+struct nouveau_winsys {
+	struct nouveau_context *nv;
+
+	struct nouveau_channel *channel;
+
+	int  (*res_init)(struct nouveau_resource **heap, unsigned start,
+			 unsigned size);
+	int  (*res_alloc)(struct nouveau_resource *heap, int size, void *priv,
+			  struct nouveau_resource **);
+	void (*res_free)(struct nouveau_resource **);
+
+	int  (*push_reloc)(struct nouveau_winsys *, void *ptr,
+			   struct pipe_buffer *, uint32_t data,
+			   uint32_t flags, uint32_t vor, uint32_t tor);
+	int  (*push_flush)(struct nouveau_winsys *, unsigned size,
+			   struct pipe_fence_handle **fence);
+			       
+	int       (*grobj_alloc)(struct nouveau_winsys *, int grclass,
+				 struct nouveau_grobj **);
+	void      (*grobj_free)(struct nouveau_grobj **);
+
+	int       (*notifier_alloc)(struct nouveau_winsys *, int count,
+				    struct nouveau_notifier **);
+	void      (*notifier_free)(struct nouveau_notifier **);
+	void      (*notifier_reset)(struct nouveau_notifier *, int id);
+	uint32_t  (*notifier_status)(struct nouveau_notifier *, int id);
+	uint32_t  (*notifier_retval)(struct nouveau_notifier *, int id);
+	int       (*notifier_wait)(struct nouveau_notifier *, int id,
+				   int status, int timeout);
+
+	int (*surface_copy)(struct nouveau_winsys *, struct pipe_surface *,
+			    unsigned, unsigned, struct pipe_surface *,
+			    unsigned, unsigned, unsigned, unsigned);
+	int (*surface_fill)(struct nouveau_winsys *, struct pipe_surface *,
+			    unsigned, unsigned, unsigned, unsigned, unsigned);
+};
+
+extern struct pipe_screen *
+nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv04_create(struct pipe_screen *, unsigned pctx_id);
+
+extern struct pipe_screen *
+nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv10_create(struct pipe_screen *, unsigned pctx_id);
+
+extern struct pipe_screen *
+nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv20_create(struct pipe_screen *, unsigned pctx_id);
+
+extern struct pipe_screen *
+nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv30_create(struct pipe_screen *, unsigned pctx_id);
+
+extern struct pipe_screen *
+nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv40_create(struct pipe_screen *, unsigned pctx_id);
+
+extern struct pipe_screen *
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *);
+
+extern struct pipe_context *
+nv50_create(struct pipe_screen *, unsigned pctx_id);
+
+#endif
diff --git a/src/gallium/drivers/nv04/Makefile b/src/gallium/drivers/nv04/Makefile
new file mode 100644
index 00000000000..5ea51a2f420
--- /dev/null
+++ b/src/gallium/drivers/nv04/Makefile
@@ -0,0 +1,28 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv04
+
+DRIVER_SOURCES = \
+	nv04_clear.c \
+	nv04_context.c \
+	nv04_fragprog.c \
+	nv04_fragtex.c \
+	nv04_miptree.c \
+	nv04_prim_vbuf.c \
+	nv04_screen.c \
+	nv04_state.c \
+	nv04_state_emit.c \
+	nv04_surface.c \
+	nv04_vbo.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv04/nv04_clear.c b/src/gallium/drivers/nv04/nv04_clear.c
new file mode 100644
index 00000000000..01cacd36fe1
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_clear.c
@@ -0,0 +1,12 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv04_context.h"
+
+void
+nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+}
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
new file mode 100644
index 00000000000..9f75253363f
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -0,0 +1,106 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv04_context.h"
+#include "nv04_screen.h"
+
+static void
+nv04_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	draw_flush(nv04->draw);
+
+	FIRE_RING(fence);
+}
+
+static void
+nv04_destroy(struct pipe_context *pipe)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	if (nv04->draw)
+		draw_destroy(nv04->draw);
+
+	FREE(nv04);
+}
+
+static void
+nv04_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+}
+
+static boolean
+nv04_init_hwctx(struct nv04_context *nv04)
+{
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1);
+	OUT_RING(0);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1);
+	OUT_RING(0);
+
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(0x40182800);
+//	OUT_RING(1<<20/*no cull*/);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
+//	OUT_RING(0x24|(1<<6)|(1<<8));
+	OUT_RING(0x120001a4);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1);
+	OUT_RING(0x332213a1);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1);
+	OUT_RING(0x11001010);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1);
+	OUT_RING(0x0);
+//	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1);
+//	OUT_RING(SCREEN_OFFSET);
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1);
+	OUT_RING(0xff000000);
+
+
+
+	FIRE_RING (NULL);
+	return TRUE;
+}
+
+struct pipe_context *
+nv04_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct nv04_screen *screen = nv04_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv04_context *nv04;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nv04 = CALLOC(1, sizeof(struct nv04_context));
+	if (!nv04)
+		return NULL;
+	nv04->screen = screen;
+	nv04->pctx_id = pctx_id;
+
+	nv04->nvws = nvws;
+
+	nv04->pipe.winsys = ws;
+	nv04->pipe.screen = pscreen;
+	nv04->pipe.destroy = nv04_destroy;
+	nv04->pipe.set_edgeflags = nv04_set_edgeflags;
+	nv04->pipe.draw_arrays = nv04_draw_arrays;
+	nv04->pipe.draw_elements = nv04_draw_elements;
+	nv04->pipe.clear = nv04_clear;
+	nv04->pipe.flush = nv04_flush;
+
+	nv04_init_surface_functions(nv04);
+	nv04_init_state_functions(nv04);
+
+	nv04->draw = draw_create();
+	assert(nv04->draw);
+	draw_wide_point_threshold(nv04->draw, 0.0);
+	draw_wide_line_threshold(nv04->draw, 0.0);
+	draw_enable_line_stipple(nv04->draw, FALSE);
+	draw_enable_point_sprites(nv04->draw, FALSE);
+	draw_set_rasterize_stage(nv04->draw, nv04_draw_vbuf_stage(nv04));
+
+	nv04_init_hwctx(nv04);
+
+	return &nv04->pipe;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h
new file mode 100644
index 00000000000..3e6a0852702
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_context.h
@@ -0,0 +1,146 @@
+#ifndef __NV04_CONTEXT_H__
+#define __NV04_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv04_screen *ctx = nv04->screen
+#include "nouveau/nouveau_push.h"
+
+#include "nv04_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+#include "nv04_screen.h"
+
+#define NV04_NEW_VERTPROG	(1 << 1)
+#define NV04_NEW_FRAGPROG	(1 << 2)
+#define NV04_NEW_BLEND		(1 << 3)
+#define NV04_NEW_RAST		(1 << 4)
+#define NV04_NEW_CONTROL	(1 << 5)
+#define NV04_NEW_VIEWPORT	(1 << 6)
+#define NV04_NEW_SAMPLER	(1 << 7)
+
+struct nv04_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv04_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	int chipset;
+	struct nouveau_notifier *sync;
+
+	uint32_t dirty;
+
+	struct nv04_blend_state *blend;
+	struct nv04_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+	struct nv04_fragtex_state fragtex;
+	struct nv04_rasterizer_state *rast;
+	struct nv04_depth_stencil_alpha_state *dsa;
+
+	struct nv04_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned dirty_samplers;
+	unsigned fp_samplers;
+	unsigned vp_samplers;
+
+	uint32_t rt_enable;
+	struct pipe_buffer *rt[4];
+	struct pipe_buffer *zeta;
+
+	struct {
+		struct pipe_buffer *buffer;
+		uint32_t format;
+	} tex[16];
+
+	unsigned vb_enable;
+	struct {
+		struct pipe_buffer *buffer;
+		unsigned delta;
+	} vb[16];
+
+	struct vertex_info vertex_info;
+	struct {
+	
+		struct nouveau_resource *exec_heap;
+		struct nouveau_resource *data_heap;
+
+		struct nv04_vertex_program *active;
+
+		struct nv04_vertex_program *current;
+		struct pipe_buffer *constant_buf;
+	} vertprog;
+
+	struct {
+		struct nv04_fragment_program *active;
+
+		struct nv04_fragment_program *current;
+		struct pipe_buffer *constant_buf;
+	} fragprog;
+
+	struct pipe_vertex_buffer  vertex_buffer[PIPE_MAX_ATTRIBS];
+	unsigned num_vertex_buffers;
+	unsigned num_vertex_elements;
+
+	struct pipe_viewport_state viewport;
+};
+
+static INLINE struct nv04_context *
+nv04_context(struct pipe_context *pipe)
+{
+	return (struct nv04_context *)pipe;
+}
+
+extern void nv04_init_state_functions(struct nv04_context *nv04);
+extern void nv04_init_surface_functions(struct nv04_context *nv04);
+extern void nv04_init_miptree_functions(struct pipe_screen *screen);
+
+/* nv04_clear.c */
+extern void nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+/* nv04_draw.c */
+extern struct draw_stage *nv04_draw_render_stage(struct nv04_context *nv04);
+
+/* nv04_fragprog.c */
+extern void nv04_fragprog_bind(struct nv04_context *,
+			       struct nv04_fragment_program *);
+extern void nv04_fragprog_destroy(struct nv04_context *,
+				  struct nv04_fragment_program *);
+
+/* nv04_fragtex.c */
+extern void nv04_fragtex_bind(struct nv04_context *);
+
+/* nv04_prim_vbuf.c */
+struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 );
+
+/* nv04_state.c and friends */
+extern void nv04_emit_hw_state(struct nv04_context *nv04);
+extern void nv04_state_tex_update(struct nv04_context *nv04);
+
+/* nv04_vbo.c */
+extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv04_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count);
+
+
+#endif
diff --git a/src/gallium/drivers/nv04/nv04_fragprog.c b/src/gallium/drivers/nv04/nv04_fragprog.c
new file mode 100644
index 00000000000..8a2af41fe06
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_fragprog.c
@@ -0,0 +1,21 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv04_context.h"
+
+void
+nv04_fragprog_bind(struct nv04_context *nv04, struct nv04_fragment_program *fp)
+{
+}
+
+void
+nv04_fragprog_destroy(struct nv04_context *nv04,
+		      struct nv04_fragment_program *fp)
+{
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c
new file mode 100644
index 00000000000..1b866aae199
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_fragtex.c
@@ -0,0 +1,98 @@
+#include "nv04_context.h"
+
+static INLINE int log2i(int i)
+{
+	int r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+#define _(m,tf)                                                                \
+{                                                                              \
+  PIPE_FORMAT_##m,                                                             \
+  NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf,                                               \
+}
+
+struct nv04_texture_format {
+	uint	pipe;
+	int     format;
+};
+
+static struct nv04_texture_format
+nv04_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8),
+	_(X8R8G8B8_UNORM, X8R8G8B8),
+	_(A1R5G5B5_UNORM, A1R5G5B5),
+	_(A4R4G4B4_UNORM, A4R4G4B4),
+	_(L8_UNORM,       Y8      ),
+	_(A8_UNORM,       Y8      ),
+};
+
+static uint32_t
+nv04_fragtex_format(uint pipe_format)
+{
+	struct nv04_texture_format *tf = nv04_texture_formats;
+	int i;
+
+	for (i=0; i< sizeof(nv04_texture_formats)/sizeof(nv04_texture_formats[0]); i++) {
+		if (tf->pipe == pipe_format)
+			return tf->format;
+		tf++;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format));
+	return 0;
+}
+
+
+static void
+nv04_fragtex_build(struct nv04_context *nv04, int unit)
+{
+	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
+	struct pipe_texture *pt = &nv04mt->base;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_2D:
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER 
+		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER
+		| nv04_fragtex_format(pt->format)
+		| ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT )
+		| ( log2i(pt->width[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT )
+		| ( log2i(pt->height[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT )
+		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE
+		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE
+		;
+}
+
+
+void
+nv04_fragtex_bind(struct nv04_context *nv04)
+{
+	nv04_fragtex_build(nv04, 0);
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_miptree.c b/src/gallium/drivers/nv04/nv04_miptree.c
new file mode 100644
index 00000000000..0cbb91e187b
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_miptree.c
@@ -0,0 +1,138 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv04_context.h"
+#include "nv04_screen.h"
+
+static void
+nv04_miptree_layout(struct nv04_miptree *nv04mt)
+{
+	struct pipe_texture *pt = &nv04mt->base;
+	uint width = pt->width[0], height = pt->height[0];
+	uint offset = 0;
+	int nr_faces, l, f;
+
+	nr_faces = 1;
+
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+		
+		nv04mt->level[l].pitch = pt->width[0] * pt->block.size;
+		nv04mt->level[l].pitch = (nv04mt->level[l].pitch + 63) & ~63;
+
+		nv04mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			nv04mt->level[l].image_offset[f] = offset;
+			offset += nv04mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	nv04mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv04_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv04_miptree *mt;
+
+	mt = MALLOC(sizeof(struct nv04_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = screen;
+
+	nv04_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL,
+					   mt->total_size);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+	
+	return &mt->base;
+}
+
+static void
+nv04_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt)
+{
+	struct pipe_texture *mt = *pt;
+
+	*pt = NULL;
+	if (--mt->refcount <= 0) {
+		struct nv04_miptree *nv04mt = (struct nv04_miptree *)mt;
+		int l;
+
+		pipe_buffer_reference(screen, &nv04mt->buffer, NULL);
+		for (l = 0; l <= mt->last_level; l++) {
+			if (nv04mt->level[l].image_offset)
+				FREE(nv04mt->level[l].image_offset);
+		}
+		FREE(nv04mt);
+	}
+}
+
+static struct pipe_surface *
+nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv04_miptree *nv04mt = (struct nv04_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = ws->surface_alloc(ws);
+	if (!ps)
+		return NULL;
+	pipe_buffer_reference(pscreen, &ps->buffer, nv04mt->buffer);
+	ps->format = pt->format;
+		ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = nv04mt->level[level].pitch;
+	ps->refcount = 1;
+	ps->winsys = pscreen->winsys;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ps->offset = nv04mt->level[level].image_offset[face];
+	} else {
+		ps->offset = nv04mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv04_miptree_surface_del(struct pipe_screen *pscreen,
+			 struct pipe_surface **psurface)
+{
+}
+
+void
+nv04_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv04_miptree_create;
+	pscreen->texture_release = nv04_miptree_release;
+	pscreen->get_tex_surface = nv04_miptree_surface_new;
+	pscreen->tex_surface_release = nv04_miptree_surface_del;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
new file mode 100644
index 00000000000..19979fff795
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
@@ -0,0 +1,309 @@
+
+#include "pipe/p_debug.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_compiler.h"
+
+#include "draw/draw_vbuf.h"
+
+#include "nv04_context.h"
+#include "nv04_state.h"
+
+#define VERTEX_SIZE 40
+#define VERTEX_BUFFER_SIZE (4096*VERTEX_SIZE) // 4096 vertices of 40 bytes each
+
+/**
+ * Primitive renderer for nv04.
+ */
+struct nv04_vbuf_render {
+	struct vbuf_render base;
+
+	struct nv04_context *nv04;   
+
+	/** Vertex buffer */
+	unsigned char* buffer;
+
+	/** Vertex size in bytes */
+	unsigned vertex_size;
+
+	/** Current primitive */
+	unsigned prim;
+};
+
+
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct nv04_vbuf_render *
+nv04_vbuf_render( struct vbuf_render *render )
+{
+	assert(render);
+	return (struct nv04_vbuf_render *)render;
+}
+
+
+static const struct vertex_info *
+nv04_vbuf_render_get_vertex_info( struct vbuf_render *render )
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+	struct nv04_context *nv04 = nv04_render->nv04;
+	return &nv04->vertex_info;
+}
+
+
+static void *
+nv04_vbuf_render_allocate_vertices( struct vbuf_render *render,
+		ushort vertex_size,
+		ushort nr_vertices )
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+
+	nv04_render->buffer = (unsigned char*) MALLOC(VERTEX_BUFFER_SIZE);
+	assert(!nv04_render->buffer);
+
+	return nv04_render->buffer;
+}
+
+
+static boolean 
+nv04_vbuf_render_set_primitive( struct vbuf_render *render, 
+		unsigned prim )
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+
+	if (prim <= PIPE_PRIM_LINE_STRIP)
+		return FALSE;
+
+	nv04_render->prim = prim;
+	return TRUE;
+}
+
+static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5)
+{
+	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
+	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v4,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v5,8);
+	OUT_RING(0xFEDCBA);
+}
+
+static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2)
+{
+	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
+	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
+	OUT_RING(0xFED);
+}
+
+static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3)
+{
+	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
+	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
+	OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
+	OUT_RING(0xFECEDC);
+}
+
+static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices)
+{
+	unsigned char* buffer = render->buffer;
+	struct nv04_context* nv04 = render->nv04;
+	int i;
+
+	for( i=0; i< nr_indices-5; i+=6)
+		nv04_2triangles(nv04,
+				buffer,
+				indices[i+0],
+				indices[i+1],
+				indices[i+2],
+				indices[i+3],
+				indices[i+4],
+				indices[i+5]
+			       );
+	if (i != nr_indices)
+	{
+		nv04_1triangle(nv04,
+				buffer,
+				indices[i+0],
+				indices[i+1],
+				indices[i+2]
+			       );
+		i+=3;
+	}
+	if (i != nr_indices)
+		NOUVEAU_ERR("Houston, we have lost some vertices\n");
+}
+
+static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices)
+{
+	const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC};
+	unsigned char* buffer = render->buffer;
+	struct nv04_context* nv04 = render->nv04;
+	int i,j;
+
+	for(i = 0; i<nr_indices; i+=14) 
+	{
+		int numvert = MIN2(16, nr_indices - i);
+		int numtri = numvert - 2;
+		if (numvert<3)
+			break;
+
+		BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
+		for(j = 0; j<numvert; j++)
+			OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 );
+
+		BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 );
+		for(j = 0; j<numtri/2; j++ )
+			OUT_RING(striptbl[j]);
+		if (numtri%2)
+			OUT_RING(striptbl[numtri/2]&0xFFF);
+	}
+}
+
+static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices)
+{
+	const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0};
+	unsigned char* buffer = render->buffer;
+	struct nv04_context* nv04 = render->nv04;
+	int i,j;
+
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
+	OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8);
+
+	for(i = 1; i<nr_indices; i+=14)
+	{
+		int numvert=MIN2(15, nr_indices - i);
+		int numtri=numvert-2;
+		if (numvert < 3)
+			break;
+
+		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
+
+		for(j=0;j<numvert;j++)
+			OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 );
+
+		BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2);
+		for(j = 0; j<numtri/2; j++)
+			OUT_RING(fantbl[j]);
+		if (numtri%2)
+			OUT_RING(fantbl[numtri/2]&0xFFF);
+	}
+}
+
+static void nv04_vbuf_render_quads_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices)
+{
+	unsigned char* buffer = render->buffer;
+	struct nv04_context* nv04 = render->nv04;
+	int i;
+
+	for(i = 0; i < nr_indices; i += 4)
+		nv04_1quad(nv04,
+				buffer,
+				indices[i+0],
+				indices[i+1],
+				indices[i+2],
+				indices[i+3]
+			       );
+}
+
+
+static void 
+nv04_vbuf_render_draw( struct vbuf_render *render,
+		const ushort *indices,
+		uint nr_indices)
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+
+	// emit the indices
+	switch( nv04_render->prim )
+	{
+		case PIPE_PRIM_TRIANGLES:
+			nv04_vbuf_render_triangles_elts(nv04_render, indices, nr_indices);
+			break;
+		case PIPE_PRIM_QUAD_STRIP:
+		case PIPE_PRIM_TRIANGLE_STRIP:
+			nv04_vbuf_render_tri_strip_elts(nv04_render, indices, nr_indices);
+			break;
+		case PIPE_PRIM_TRIANGLE_FAN:
+		case PIPE_PRIM_POLYGON:
+			nv04_vbuf_render_tri_fan_elts(nv04_render, indices, nr_indices);
+			break;
+		case PIPE_PRIM_QUADS:
+			nv04_vbuf_render_quads_elts(nv04_render, indices, nr_indices);
+			break;
+		default:
+			NOUVEAU_ERR("You have to implement primitive %d, young padawan\n", nv04_render->prim);
+			break;
+	}
+}
+
+
+static void
+nv04_vbuf_render_release_vertices( struct vbuf_render *render,
+		void *vertices, 
+		unsigned vertex_size,
+		unsigned vertices_used )
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+
+	free(nv04_render->buffer);
+	nv04_render->buffer = NULL;
+}
+
+
+static void
+nv04_vbuf_render_destroy( struct vbuf_render *render )
+{
+	struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render);
+	FREE(nv04_render);
+}
+
+
+/**
+ * Create a new primitive render.
+ */
+static struct vbuf_render *
+nv04_vbuf_render_create( struct nv04_context *nv04 )
+{
+	struct nv04_vbuf_render *nv04_render = CALLOC_STRUCT(nv04_vbuf_render);
+
+	nv04_render->nv04 = nv04;
+
+	nv04_render->base.max_vertex_buffer_bytes = VERTEX_BUFFER_SIZE;
+	nv04_render->base.max_indices = 65536; 
+	nv04_render->base.get_vertex_info = nv04_vbuf_render_get_vertex_info;
+	nv04_render->base.allocate_vertices = nv04_vbuf_render_allocate_vertices;
+	nv04_render->base.set_primitive = nv04_vbuf_render_set_primitive;
+	nv04_render->base.draw = nv04_vbuf_render_draw;
+	nv04_render->base.release_vertices = nv04_vbuf_render_release_vertices;
+	nv04_render->base.destroy = nv04_vbuf_render_destroy;
+
+	return &nv04_render->base;
+}
+
+
+/**
+ * Create a new primitive vbuf/render stage.
+ */
+struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 )
+{
+	struct vbuf_render *render;
+	struct draw_stage *stage;
+
+	render = nv04_vbuf_render_create(nv04);
+	if(!render)
+		return NULL;
+
+	stage = draw_vbuf_stage( nv04->draw, render );
+	if(!stage) {
+		render->destroy(render);
+		return NULL;
+	}
+
+	return stage;
+}
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
new file mode 100644
index 00000000000..3966a29ffa9
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -0,0 +1,212 @@
+#include "pipe/p_screen.h"
+
+#include "nv04_context.h"
+#include "nv04_screen.h"
+
+static const char *
+nv04_screen_get_name(struct pipe_screen *screen)
+{
+	struct nv04_screen *nv04screen = nv04_screen(screen);
+	struct nouveau_device *dev = nv04screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv04_screen_get_vendor(struct pipe_screen *screen)
+{
+	return "nouveau";
+}
+
+static int
+nv04_screen_get_param(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 1;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 0;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 0;
+	case PIPE_CAP_POINT_SPRITE:
+		return 0;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 1;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv04_screen_get_paramf(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 0.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 0.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 0.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 0.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static boolean
+nv04_screen_is_format_supported(struct pipe_screen *screen,
+				enum pipe_format format,
+				enum pipe_texture_target target,
+				unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_X8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static void *
+nv04_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys *ws = screen->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, surface->buffer, flags);
+	if (!map)
+		return NULL;
+
+	return map + surface->offset;
+}
+
+static void
+nv04_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+
+	ws->buffer_unmap(ws, surface->buffer);
+}
+
+static void
+nv04_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv04_screen *screen = nv04_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->fahrenheit);
+
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv04_screen *screen = CALLOC_STRUCT(nv04_screen);
+	unsigned fahrenheit_class = 0, sub3d_class = 0;
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	if (chipset>=0x20) {
+		fahrenheit_class = 0;
+		sub3d_class = 0;
+	} else if (chipset>=0x10) {
+		fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE;
+		sub3d_class = NV10_CONTEXT_SURFACES_3D;
+	} else {
+		fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE;
+		sub3d_class = NV04_CONTEXT_SURFACES_3D;
+	}
+
+	if (!fahrenheit_class) {
+		NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", chipset);
+		return NULL;
+	}
+
+	/* 3D object */
+	ret = nvws->grobj_alloc(nvws, fahrenheit_class, &screen->fahrenheit);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return NULL;
+	}
+
+	/* 3D surface object */
+	ret = nvws->grobj_alloc(nvws, sub3d_class, &screen->context_surfaces_3d);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D surface object: %d\n", ret);
+		return NULL;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv04_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv04_screen_destroy;
+
+	screen->pipe.get_name = nv04_screen_get_name;
+	screen->pipe.get_vendor = nv04_screen_get_vendor;
+	screen->pipe.get_param = nv04_screen_get_param;
+	screen->pipe.get_paramf = nv04_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv04_screen_is_format_supported;
+
+	screen->pipe.surface_map = nv04_surface_map;
+	screen->pipe.surface_unmap = nv04_surface_unmap;
+
+	nv04_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_screen.h b/src/gallium/drivers/nv04/nv04_screen.h
new file mode 100644
index 00000000000..99a49cdf7a9
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_screen.h
@@ -0,0 +1,25 @@
+#ifndef __NV04_SCREEN_H__
+#define __NV04_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv04_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+	unsigned chipset;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *fahrenheit;
+	struct nouveau_grobj *context_surfaces_3d;
+	struct nouveau_notifier *sync;
+
+};
+
+static INLINE struct nv04_screen *
+nv04_screen(struct pipe_screen *screen)
+{
+	return (struct nv04_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c
new file mode 100644
index 00000000000..ff1933b5508
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_state.c
@@ -0,0 +1,495 @@
+#include "draw/draw_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv04_context.h"
+#include "nv04_state.h"
+
+static void *
+nv04_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv04_blend_state *cb;
+
+	cb = MALLOC(sizeof(struct nv04_blend_state));
+
+	cb->b_enable = cso->blend_enable ? 1 : 0;
+	cb->b_src = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_src_factor)));
+	cb->b_dst = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_dst_factor)));
+	
+
+	return (void *)cb;
+}
+
+static void
+nv04_blend_state_bind(struct pipe_context *pipe, void *blend)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	nv04->blend = (struct nv04_blend_state*)blend;
+
+	nv04->dirty |= NV04_NEW_BLEND;
+}
+
+static void
+nv04_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+	}
+	return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT;
+}
+
+static void *
+nv04_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+
+	struct nv04_sampler_state *ss;
+	uint32_t filter = 0;
+
+	ss = MALLOC(sizeof(struct nv04_sampler_state));
+
+	ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT));
+
+	if (cso->max_anisotropy > 1.0) {
+		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE;
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ss->filter = filter;
+
+	return (void *)ss;
+}
+
+static void
+nv04_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv04->sampler[unit] = sampler[unit];
+		nv04->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void
+nv04_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);
+}
+
+static void
+nv04_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv04->tex_miptree[unit] = (struct nv04_miptree *)miptree[unit];
+		nv04->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void *
+nv04_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv04_rasterizer_state *rs;
+
+	/*XXX: ignored:
+	 * 	scissor
+	 * 	points/lines (no hw support, emulated with tris in gallium)
+	 */
+	rs = MALLOC(sizeof(struct nv04_rasterizer_state));
+
+	rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD;
+
+	return (void *)rs;
+}
+
+static void
+nv04_rasterizer_state_bind(struct pipe_context *pipe, void *rast)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	nv04->rast = (struct nv04_rasterizer_state*)rast;
+
+	draw_set_rasterizer_state(nv04->draw, (nv04->rast ? nv04->rast->templ : NULL));
+
+	nv04->dirty |= NV04_NEW_RAST | NV04_NEW_BLEND;
+}
+
+static void
+nv04_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);
+}
+
+static INLINE uint32_t nv04_compare_func(uint32_t f)
+{
+	switch ( f ) {
+		case PIPE_FUNC_NEVER:		return 1;
+		case PIPE_FUNC_LESS:		return 2;
+		case PIPE_FUNC_EQUAL:		return 3;
+		case PIPE_FUNC_LEQUAL:		return 4;
+		case PIPE_FUNC_GREATER:		return 5;
+		case PIPE_FUNC_NOTEQUAL:	return 6;
+		case PIPE_FUNC_GEQUAL:		return 7;
+		case PIPE_FUNC_ALWAYS:		return 8;
+	}
+	NOUVEAU_MSG("Unable to find the function\n");
+	return 0;
+}
+
+static void *
+nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv04_depth_stencil_alpha_state *hw;
+
+	hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state));
+
+	hw->control = float_to_ubyte(cso->alpha.ref);
+	hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT );
+	hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0;
+	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN;
+	hw->control |= cso->depth.enabled ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0;
+	hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT );
+	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module
+	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE;
+	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE;
+	hw->control |= cso->depth.writemask ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0;
+	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format
+
+	return (void *)hw;
+}
+
+static void
+nv04_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	nv04->dsa = hwcso;
+	nv04->dirty |= NV04_NEW_CONTROL;
+}
+
+static void
+nv04_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	free(hwcso);
+}
+
+static void *
+nv04_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *templ)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	return draw_create_vertex_shader(nv04->draw, templ);
+}
+
+static void
+nv04_vp_state_bind(struct pipe_context *pipe, void *shader)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	draw_bind_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader);
+
+	nv04->dirty |= NV04_NEW_VERTPROG;
+}
+
+static void
+nv04_vp_state_delete(struct pipe_context *pipe, void *shader)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	draw_delete_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader);
+}
+
+static void *
+nv04_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv04_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nv04_fragment_program));
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+	return (void *)fp;
+}
+
+static void
+nv04_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nv04_fragment_program *fp = hwcso;
+
+	nv04->fragprog.current = fp;
+	nv04->dirty |= NV04_NEW_FRAGPROG;
+}
+
+static void
+nv04_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nv04_fragment_program *fp = hwcso;
+
+	nv04_fragprog_destroy(nv04, fp);
+	free((void*)fp->pipe.tokens);
+	free(fp);
+}
+
+static void
+nv04_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+}
+
+static void
+nv04_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+}
+
+static void
+nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nv04->vertprog.constant_buf = buf->buffer;
+		nv04->dirty |= NV04_NEW_VERTPROG;
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nv04->fragprog.constant_buf = buf->buffer;
+		nv04->dirty |= NV04_NEW_FRAGPROG;
+	}
+}
+
+static void
+nv04_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct pipe_surface *rt, *zeta;
+	uint32_t rt_format, w, h;
+	int colour_format = 0, zeta_format = 0;
+
+	w = fb->cbufs[0]->width;
+	h = fb->cbufs[0]->height;
+	colour_format = fb->cbufs[0]->format;
+	rt = fb->cbufs[0];
+
+	if (fb->zsbuf) {
+		if (colour_format) {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		} else {
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+		}
+
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format = 0x108;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format = 0x103;
+		break;
+	default:
+		assert(0);
+	}
+
+	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
+	OUT_RING(rt_format);
+
+	/* FIXME pitches have to be aligned ! */
+	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(rt->stride|(zeta->stride<<16));
+	OUT_RELOCl(rt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	if (fb->zsbuf) {
+		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(zeta->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	}
+}
+
+static void
+nv04_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	NOUVEAU_ERR("line stipple hahaha\n");
+}
+
+static void
+nv04_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+/*	struct nv04_context *nv04 = nv04_context(pipe);
+
+	// XXX
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2);
+	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
+	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
+}
+
+static void
+nv04_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *viewport)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	nv04->viewport = *viewport;
+
+	draw_set_viewport_state(nv04->draw, &nv04->viewport);
+}
+
+static void
+nv04_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+		       const struct pipe_vertex_buffer *buffers)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	draw_flush(nv04->draw);
+
+	memcpy(nv04->vertex_buffer, buffers, count * sizeof(buffers[0]));
+	nv04->num_vertex_buffers = count;
+
+	draw_set_vertex_buffers(nv04->draw, count, buffers);
+}
+
+static void
+nv04_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_element *elements)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+
+	draw_flush(nv04->draw);
+
+	nv04->num_vertex_elements = count;
+	draw_set_vertex_elements(nv04->draw, count, elements);
+}
+
+void
+nv04_init_state_functions(struct nv04_context *nv04)
+{
+	nv04->pipe.create_blend_state = nv04_blend_state_create;
+	nv04->pipe.bind_blend_state = nv04_blend_state_bind;
+	nv04->pipe.delete_blend_state = nv04_blend_state_delete;
+
+	nv04->pipe.create_sampler_state = nv04_sampler_state_create;
+	nv04->pipe.bind_sampler_states = nv04_sampler_state_bind;
+	nv04->pipe.delete_sampler_state = nv04_sampler_state_delete;
+	nv04->pipe.set_sampler_textures = nv04_set_sampler_texture;
+
+	nv04->pipe.create_rasterizer_state = nv04_rasterizer_state_create;
+	nv04->pipe.bind_rasterizer_state = nv04_rasterizer_state_bind;
+	nv04->pipe.delete_rasterizer_state = nv04_rasterizer_state_delete;
+
+	nv04->pipe.create_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_create;
+	nv04->pipe.bind_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_bind;
+	nv04->pipe.delete_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_delete;
+
+	nv04->pipe.create_vs_state = nv04_vp_state_create;
+	nv04->pipe.bind_vs_state = nv04_vp_state_bind;
+	nv04->pipe.delete_vs_state = nv04_vp_state_delete;
+
+	nv04->pipe.create_fs_state = nv04_fp_state_create;
+	nv04->pipe.bind_fs_state = nv04_fp_state_bind;
+	nv04->pipe.delete_fs_state = nv04_fp_state_delete;
+
+	nv04->pipe.set_blend_color = nv04_set_blend_color;
+	nv04->pipe.set_clip_state = nv04_set_clip_state;
+	nv04->pipe.set_constant_buffer = nv04_set_constant_buffer;
+	nv04->pipe.set_framebuffer_state = nv04_set_framebuffer_state;
+	nv04->pipe.set_polygon_stipple = nv04_set_polygon_stipple;
+	nv04->pipe.set_scissor_state = nv04_set_scissor_state;
+	nv04->pipe.set_viewport_state = nv04_set_viewport_state;
+
+	nv04->pipe.set_vertex_buffers = nv04_set_vertex_buffers;
+	nv04->pipe.set_vertex_elements = nv04_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_state.h b/src/gallium/drivers/nv04/nv04_state.h
new file mode 100644
index 00000000000..399f750dbe7
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_state.h
@@ -0,0 +1,71 @@
+#ifndef __NV04_STATE_H__
+#define __NV04_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv04_blend_state {
+	uint32_t b_enable;
+	uint32_t b_src;
+	uint32_t b_dst;
+};
+
+struct nv04_fragtex_state {
+	uint32_t format;
+};
+
+struct nv04_sampler_state {
+	uint32_t filter;
+	uint32_t format;
+};
+
+struct nv04_depth_stencil_alpha_state {
+	uint32_t control;
+};
+
+struct nv04_rasterizer_state {
+	uint32_t blend;
+
+	const struct pipe_rasterizer_state *templ;
+};
+
+struct nv04_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+struct nv04_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv04_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	boolean on_hw;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv04_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	uint32_t fp_reg_control;
+};
+
+
+
+#endif
diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c
new file mode 100644
index 00000000000..0ad40a092ea
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_state_emit.c
@@ -0,0 +1,156 @@
+#include "nv04_context.h"
+#include "nv04_state.h"
+
+static void nv04_vertex_layout(struct pipe_context* pipe)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nv04_fragment_program *fp = nv04->fragprog.current;
+	uint32_t src = 0;
+	int i;
+	struct vertex_info vinfo;
+
+	memset(&vinfo, 0, sizeof(vinfo));
+
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		switch (i) {
+			case TGSI_SEMANTIC_POSITION:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			case TGSI_SEMANTIC_COLOR:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			default:
+			case TGSI_SEMANTIC_GENERIC:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+			case TGSI_SEMANTIC_FOG:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+		}
+	}
+	draw_compute_vertex_size(&vinfo);
+}
+
+static uint32_t nv04_blend_func(uint32_t f)
+{
+	switch ( f ) {
+		case PIPE_BLENDFACTOR_ZERO:			return 0x1;
+		case PIPE_BLENDFACTOR_ONE:			return 0x2;
+		case PIPE_BLENDFACTOR_SRC_COLOR:		return 0x3;
+		case PIPE_BLENDFACTOR_INV_SRC_COLOR:		return 0x4;
+		case PIPE_BLENDFACTOR_SRC_ALPHA:		return 0x5;
+		case PIPE_BLENDFACTOR_INV_SRC_ALPHA:		return 0x6;
+		case PIPE_BLENDFACTOR_DST_ALPHA:		return 0x7;
+		case PIPE_BLENDFACTOR_INV_DST_ALPHA:		return 0x8;
+		case PIPE_BLENDFACTOR_DST_COLOR:		return 0x9;
+		case PIPE_BLENDFACTOR_INV_DST_COLOR:		return 0xA;
+		case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:	return 0xB;
+	}
+	NOUVEAU_MSG("Unable to find the blend function 0x%x\n",f);
+	return 0;
+}
+
+static void nv04_emit_control(struct nv04_context* nv04)
+{
+	uint32_t control = nv04->dsa->control;
+
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+	OUT_RING(control);
+}
+
+static void nv04_emit_blend(struct nv04_context* nv04)
+{
+	uint32_t blend;
+
+	blend=0x4; // texture MODULATE_ALPHA
+	blend|=0x20; // alpha is MSB
+	blend|=(2<<6); // flat shading
+	blend|=(1<<8); // persp correct
+	blend|=(0<<16); // no fog
+	blend|=(nv04->blend->b_enable<<20);
+	blend|=(nv04_blend_func(nv04->blend->b_src)<<24);
+	blend|=(nv04_blend_func(nv04->blend->b_dst)<<28);
+
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
+	OUT_RING(blend);
+}
+
+static void nv04_emit_sampler(struct nv04_context *nv04, int unit)
+{
+	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
+	struct pipe_texture *pt = &nv04mt->base;
+
+	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3);
+	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING(nv04->sampler[unit]->filter);
+}
+
+void
+nv04_emit_hw_state(struct nv04_context *nv04)
+{
+	int i;
+
+	if (nv04->dirty & NV04_NEW_VERTPROG) {
+		//nv04_vertprog_bind(nv04, nv04->vertprog.current);
+		nv04->dirty &= ~NV04_NEW_VERTPROG;
+	}
+
+	if (nv04->dirty & NV04_NEW_FRAGPROG) {
+		nv04_fragprog_bind(nv04, nv04->fragprog.current);
+		/*XXX: clear NV04_NEW_FRAGPROG if no new program uploaded */
+		nv04->dirty_samplers |= (1<<10);
+		nv04->dirty_samplers = 0;
+	}
+
+	if (nv04->dirty & NV04_NEW_CONTROL) {
+		nv04->dirty &= ~NV04_NEW_CONTROL;
+
+		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+		OUT_RING(nv04->dsa->control);
+	}
+
+	if (nv04->dirty & NV04_NEW_BLEND) {
+		nv04->dirty &= ~NV04_NEW_BLEND;
+
+		nv04_emit_blend(nv04);
+	}
+
+	if (nv04->dirty & NV04_NEW_SAMPLER) {
+		nv04->dirty &= ~NV04_NEW_SAMPLER;
+
+		nv04_emit_sampler(nv04, 0);
+	}
+
+	if (nv04->dirty & NV04_NEW_VIEWPORT) {
+		nv04->dirty &= ~NV04_NEW_VIEWPORT;
+//		nv04_state_emit_viewport(nv04);
+	}
+
+	/* Emit relocs for every referenced buffer.
+	 * This is to ensure the bufmgr has an accurate idea of how
+	 * the buffer is used.  This isn't very efficient, but we don't
+	 * seem to take a significant performance hit.  Will be improved
+	 * at some point.  Vertex arrays are emitted by nv04_vbo.c
+	 */
+
+	/* Render target */
+/*	BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+	OUT_RING(rt->stride|(zeta->stride<<16));
+	OUT_RELOCl(rt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	if (fb->zsbuf) {
+		BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+		OUT_RELOCl(zeta->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	}*/
+
+	/* Texture images */
+	for (i = 0; i < 1; i++) {
+		if (!(nv04->fp_samplers & (1 << i)))
+			continue;
+		struct nv04_miptree *nv04mt = nv04->tex_miptree[i];
+		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2);
+		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	}
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_surface.c b/src/gallium/drivers/nv04/nv04_surface.c
new file mode 100644
index 00000000000..57039483c62
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_surface.c
@@ -0,0 +1,64 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv04_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+#include "util/u_tile.h"
+
+static void
+nv04_surface_copy(struct pipe_context *pipe, unsigned do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nouveau_winsys *nvws = nv04->nvws;
+
+	nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+			   width, height);
+}
+
+static void
+nv04_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv04_context *nv04 = nv04_context(pipe);
+	struct nouveau_winsys *nvws = nv04->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+void
+nv04_init_surface_functions(struct nv04_context *nv04)
+{
+	nv04->pipe.surface_copy = nv04_surface_copy;
+	nv04->pipe.surface_fill = nv04_surface_fill;
+}
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
new file mode 100644
index 00000000000..91f919d48ec
--- /dev/null
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -0,0 +1,71 @@
+#include "draw/draw_context.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv04_context.h"
+#include "nv04_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+boolean nv04_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count)
+{
+	struct nv04_context *nv04 = nv04_context( pipe );
+	struct draw_context *draw = nv04->draw;
+	unsigned i;
+
+	/*
+	 * Map vertex buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv04->vertex_buffer[i].buffer) {
+			void *buf
+				= pipe->winsys->buffer_map(pipe->winsys,
+						nv04->vertex_buffer[i].buffer,
+						PIPE_BUFFER_USAGE_CPU_READ);
+			draw_set_mapped_vertex_buffer(draw, i, buf);
+		}
+	}
+	/* Map index buffer, if present */
+	if (indexBuffer) {
+		void *mapped_indexes
+			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+					PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
+	}
+	else {
+		/* no index/element buffer */
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	/* draw! */
+	draw_arrays(nv04->draw, prim, start, count);
+
+	/*
+	 * unmap vertex/index buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv04->vertex_buffer[i].buffer) {
+			pipe->winsys->buffer_unmap(pipe->winsys, nv04->vertex_buffer[i].buffer);
+			draw_set_mapped_vertex_buffer(draw, i, NULL);
+		}
+	}
+	if (indexBuffer) {
+		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	return TRUE;
+}
+
+boolean nv04_draw_arrays( struct pipe_context *pipe,
+				 unsigned prim, unsigned start, unsigned count)
+{
+	return nv04_draw_elements(pipe, NULL, 0, prim, start, count);
+}
+
+
+
diff --git a/src/gallium/drivers/nv10/Makefile b/src/gallium/drivers/nv10/Makefile
new file mode 100644
index 00000000000..4ba7ce586d6
--- /dev/null
+++ b/src/gallium/drivers/nv10/Makefile
@@ -0,0 +1,28 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv10
+
+DRIVER_SOURCES = \
+	nv10_clear.c \
+	nv10_context.c \
+	nv10_fragprog.c \
+	nv10_fragtex.c \
+	nv10_miptree.c \
+	nv10_prim_vbuf.c \
+	nv10_screen.c \
+	nv10_state.c \
+	nv10_state_emit.c \
+	nv10_surface.c \
+	nv10_vbo.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv10/nv10_clear.c b/src/gallium/drivers/nv10/nv10_clear.c
new file mode 100644
index 00000000000..be7e09cf4b0
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_clear.c
@@ -0,0 +1,12 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv10_context.h"
+
+void
+nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+}
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
new file mode 100644
index 00000000000..e9b61daae7f
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -0,0 +1,296 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv10_context.h"
+#include "nv10_screen.h"
+
+static void
+nv10_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	draw_flush(nv10->draw);
+
+	FIRE_RING(fence);
+}
+
+static void
+nv10_destroy(struct pipe_context *pipe)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	if (nv10->draw)
+		draw_destroy(nv10->draw);
+
+	FREE(nv10);
+}
+
+static void nv10_init_hwctx(struct nv10_context *nv10)
+{
+	struct nv10_screen *screen = nv10->screen;
+	struct nouveau_winsys *nvws = screen->nvws;
+	int i;
+	float projectionmatrix[16];
+
+	BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1);
+	OUT_RING  (screen->sync->handle);
+	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+	BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+
+	BEGIN_RING(celsius, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+
+	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  ((0x7ff<<16)|0x800);
+	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  ((0x7ff<<16)|0x800);
+
+	for (i=1;i<8;i++) {
+		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (0);
+		BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (0);
+	}
+
+	BEGIN_RING(celsius, 0x290, 1);
+	OUT_RING  ((0x10<<16)|1);
+	BEGIN_RING(celsius, 0x3f4, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(celsius, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	if (nv10->screen->celsius->grclass != NV10TCL) {
+		/* For nv11, nv17 */
+		BEGIN_RING(celsius, 0x120, 3);
+		OUT_RING  (0);
+		OUT_RING  (1);
+		OUT_RING  (2);
+
+		BEGIN_RING(celsius, NV10TCL_NOP, 1);
+		OUT_RING  (0);
+	}
+
+	BEGIN_RING(celsius, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	/* Set state */
+	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (0x207);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+
+	BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12);
+	OUT_RING  (0x30141010);
+	OUT_RING  (0);
+	OUT_RING  (0x20040000);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0x00000c00);
+	OUT_RING  (0);
+	OUT_RING  (0x00000c00);
+	OUT_RING  (0x18000000);
+	OUT_RING  (0x300e0300);
+	OUT_RING  (0x0c091c80);
+
+	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2);
+	OUT_RING  (1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (1);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0x8006);
+	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8);
+	OUT_RING  (0xff);
+	OUT_RING  (0x207);
+	OUT_RING  (0);
+	OUT_RING  (0xff);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1d01);
+	BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING  (0x201);
+	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (8);
+	BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1);
+	OUT_RING  (8);
+	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (0x1b02);
+	OUT_RING  (0x1b02);
+	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (0x405);
+	OUT_RING  (0x901);
+	BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_CLIP_PLANE_ENABLE(0), 8);
+	for (i=0;i<8;i++) {
+		OUT_RING  (0);
+	}
+	BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RING  (0x3fc00000);	/* -1.50 */
+	OUT_RING  (0xbdb8aa0a);	/* -0.09 */
+	OUT_RING  (0);		/*  0.00 */
+
+	BEGIN_RING(celsius, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2);
+	OUT_RING  (0x802);
+	OUT_RING  (2);
+	/* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when
+	 * using texturing, except when using the texture matrix
+	 */
+	BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
+	OUT_RING  (6);
+	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (0x01010101);
+
+	/* Set vertex component */
+	BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1);
+	OUT_RINGf (0.0);
+	BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (1);
+
+	memset(projectionmatrix, 0, sizeof(projectionmatrix));
+	BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
+	projectionmatrix[0*4+0] = 1.0;
+	projectionmatrix[1*4+1] = 1.0;
+	projectionmatrix[2*4+2] = 1.0;
+	projectionmatrix[3*4+3] = 1.0;
+	for (i=0;i<16;i++) {
+		OUT_RINGf  (projectionmatrix[i]);
+	}
+
+	BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RING  (0.0);
+	OUT_RINGf  (16777216.0);
+
+	BEGIN_RING(celsius, NV10TCL_VIEWPORT_SCALE_X, 4);
+	OUT_RINGf  (-2048.0);
+	OUT_RINGf  (-2048.0);
+	OUT_RINGf  (16777215.0 * 0.5);
+	OUT_RING  (0);
+
+	FIRE_RING (NULL);
+}
+
+static void
+nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+}
+
+struct pipe_context *
+nv10_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct nv10_screen *screen = nv10_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv10_context *nv10;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nv10 = CALLOC(1, sizeof(struct nv10_context));
+	if (!nv10)
+		return NULL;
+	nv10->screen = screen;
+	nv10->pctx_id = pctx_id;
+
+	nv10->nvws = nvws;
+
+	nv10->pipe.winsys = ws;
+	nv10->pipe.screen = pscreen;
+	nv10->pipe.destroy = nv10_destroy;
+	nv10->pipe.set_edgeflags = nv10_set_edgeflags;
+	nv10->pipe.draw_arrays = nv10_draw_arrays;
+	nv10->pipe.draw_elements = nv10_draw_elements;
+	nv10->pipe.clear = nv10_clear;
+	nv10->pipe.flush = nv10_flush;
+
+	nv10_init_surface_functions(nv10);
+	nv10_init_state_functions(nv10);
+
+	nv10->draw = draw_create();
+	assert(nv10->draw);
+	draw_set_rasterize_stage(nv10->draw, nv10_draw_vbuf_stage(nv10));
+
+	nv10_init_hwctx(nv10);
+
+	return &nv10->pipe;
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h
new file mode 100644
index 00000000000..f3b56de25a7
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_context.h
@@ -0,0 +1,153 @@
+#ifndef __NV10_CONTEXT_H__
+#define __NV10_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv10_screen *ctx = nv10->screen
+#include "nouveau/nouveau_push.h"
+
+#include "nv10_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+#define NV10_NEW_VERTPROG	(1 << 0)
+#define NV10_NEW_FRAGPROG	(1 << 1)
+#define NV10_NEW_VTXARRAYS	(1 << 2)
+#define NV10_NEW_BLEND		(1 << 3)
+#define NV10_NEW_BLENDCOL	(1 << 4)
+#define NV10_NEW_RAST 		(1 << 5)
+#define NV10_NEW_DSA  		(1 << 6)
+#define NV10_NEW_VIEWPORT	(1 << 7)
+#define NV10_NEW_SCISSOR	(1 << 8)
+#define NV10_NEW_FRAMEBUFFER	(1 << 9)
+
+#include "nv10_screen.h"
+
+struct nv10_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv10_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	uint32_t dirty;
+
+	struct nv10_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv10_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned dirty_samplers;
+	unsigned fp_samplers;
+	unsigned vp_samplers;
+
+	uint32_t rt_enable;
+	struct pipe_buffer *rt[4];
+	struct pipe_buffer *zeta;
+	uint32_t lma_offset;
+
+	struct nv10_blend_state *blend;
+	struct pipe_blend_color *blend_color;
+	struct nv10_rasterizer_state *rast;
+	struct nv10_depth_stencil_alpha_state *dsa;
+	struct pipe_viewport_state *viewport;
+	struct pipe_scissor_state *scissor;
+	struct pipe_framebuffer_state *framebuffer;
+
+	//struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	float *constbuf[PIPE_SHADER_TYPES][32][4];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+
+	struct vertex_info vertex_info;
+
+	struct {
+		struct pipe_buffer *buffer;
+		uint32_t format;
+	} tex[2];
+
+	unsigned vb_enable;
+	struct {
+		struct pipe_buffer *buffer;
+		unsigned delta;
+	} vb[16];
+
+/*	struct {
+	
+		struct nouveau_resource *exec_heap;
+		struct nouveau_resource *data_heap;
+
+		struct nv10_vertex_program *active;
+
+		struct nv10_vertex_program *current;
+	} vertprog;
+*/
+	struct {
+		struct nv10_fragment_program *active;
+
+		struct nv10_fragment_program *current;
+		struct pipe_buffer *constant_buf;
+	} fragprog;
+
+	struct pipe_vertex_buffer  vtxbuf[PIPE_MAX_ATTRIBS];
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+};
+
+static INLINE struct nv10_context *
+nv10_context(struct pipe_context *pipe)
+{
+	return (struct nv10_context *)pipe;
+}
+
+extern void nv10_init_state_functions(struct nv10_context *nv10);
+extern void nv10_init_surface_functions(struct nv10_context *nv10);
+
+extern void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv10_clear.c */
+extern void nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+/* nv10_draw.c */
+extern struct draw_stage *nv10_draw_render_stage(struct nv10_context *nv10);
+
+/* nv10_fragprog.c */
+extern void nv10_fragprog_bind(struct nv10_context *,
+			       struct nv10_fragment_program *);
+extern void nv10_fragprog_destroy(struct nv10_context *,
+				  struct nv10_fragment_program *);
+
+/* nv10_fragtex.c */
+extern void nv10_fragtex_bind(struct nv10_context *);
+
+/* nv10_prim_vbuf.c */
+struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 );
+extern void nv10_vtxbuf_bind(struct nv10_context* nv10);
+
+/* nv10_state.c and friends */
+extern void nv10_emit_hw_state(struct nv10_context *nv10);
+extern void nv10_state_tex_update(struct nv10_context *nv10);
+
+/* nv10_vbo.c */
+extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv10_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count);
+
+
+#endif
diff --git a/src/gallium/drivers/nv10/nv10_fragprog.c b/src/gallium/drivers/nv10/nv10_fragprog.c
new file mode 100644
index 00000000000..698db5a16a9
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_fragprog.c
@@ -0,0 +1,21 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv10_context.h"
+
+void
+nv10_fragprog_bind(struct nv10_context *nv10, struct nv10_fragment_program *fp)
+{
+}
+
+void
+nv10_fragprog_destroy(struct nv10_context *nv10,
+		      struct nv10_fragment_program *fp)
+{
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c
new file mode 100644
index 00000000000..238634d0bb4
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_fragtex.c
@@ -0,0 +1,149 @@
+#include "nv10_context.h"
+
+static INLINE int log2i(int i)
+{
+	int r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+#define _(m,tf)                                                                \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV10TCL_TX_FORMAT_FORMAT_##tf,                                               \
+}
+
+struct nv10_texture_format {
+	boolean defined;
+	uint	pipe;
+	int     format;
+};
+
+static struct nv10_texture_format
+nv10_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8),
+	_(A1R5G5B5_UNORM, A1R5G5B5),
+	_(A4R4G4B4_UNORM, A4R4G4B4),
+	_(L8_UNORM      , L8      ),
+	_(A8_UNORM      , A8      ),
+	_(A8L8_UNORM    , A8L8    ),
+//	_(RGB_DXT1      , DXT1,   ),
+//	_(RGBA_DXT1     , DXT1,   ),
+//	_(RGBA_DXT3     , DXT3,   ),
+//	_(RGBA_DXT5     , DXT5,   ),
+	{},
+};
+
+static struct nv10_texture_format *
+nv10_fragtex_format(uint pipe_format)
+{
+	struct nv10_texture_format *tf = nv10_texture_formats;
+
+	while (tf->defined) {
+		if (tf->pipe == pipe_format)
+			return tf;
+		tf++;
+	}
+
+	return NULL;
+}
+
+
+static void
+nv10_fragtex_build(struct nv10_context *nv10, int unit)
+{
+#if 0
+	struct nv10_sampler_state *ps = nv10->tex_sampler[unit];
+	struct nv10_miptree *nv10mt = nv10->tex_miptree[unit];
+	struct pipe_texture *pt = &nv10mt->base;
+	struct nv10_texture_format *tf;
+	uint32_t txf, txs, txp;
+
+	tf = nv10_fragtex_format(pt->format);
+	if (!tf || !tf->defined) {
+		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format);
+		return;
+	}
+
+	txf  = tf->format << 8;
+	txf |= (pt->last_level + 1) << 16;
+	txf |= log2i(pt->width[0]) << 20;
+	txf |= log2i(pt->height[0]) << 24;
+	txf |= log2i(pt->depth[0]) << 28;
+	txf |= 8;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV10TCL_TX_FORMAT_CUBE_MAP;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= (2<<4);
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= (1<<4);
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (ps->wrap);
+	OUT_RING  (0x40000000); /* enable */
+	OUT_RING  (txs);
+	OUT_RING  (ps->filt | 0x2000 /* magic */);
+	OUT_RING  ((pt->width[0] << 16) | pt->height[0]);
+	OUT_RING  (ps->bcol);
+#endif
+}
+
+void
+nv10_fragtex_bind(struct nv10_context *nv10)
+{
+#if 0
+	struct nv10_fragment_program *fp = nv10->fragprog.active;
+	unsigned samplers, unit;
+
+	samplers = nv10->fp_samplers & ~fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (0);
+	}
+
+	samplers = nv10->dirty_samplers & fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		nv10_fragtex_build(nv10, unit);
+	}
+
+	nv10->fp_samplers = fp->samplers;
+#endif
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_miptree.c b/src/gallium/drivers/nv10/nv10_miptree.c
new file mode 100644
index 00000000000..943f9e21e98
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_miptree.c
@@ -0,0 +1,149 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv10_context.h"
+#include "nv10_screen.h"
+
+static void
+nv10_miptree_layout(struct nv10_miptree *nv10mt)
+{
+	struct pipe_texture *pt = &nv10mt->base;
+	boolean swizzled = FALSE;
+	uint width = pt->width[0], height = pt->height[0];
+	uint offset = 0;
+	int nr_faces, l, f;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else {
+		nr_faces = 1;
+	}
+	
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+
+		if (swizzled)
+			nv10mt->level[l].pitch = pt->nblocksx[l] * pt->block.size;
+		else
+			nv10mt->level[l].pitch = pt->nblocksx[0] * pt->block.size;
+		nv10mt->level[l].pitch = (nv10mt->level[l].pitch + 63) & ~63;
+
+		nv10mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			nv10mt->level[l].image_offset[f] = offset;
+			offset += nv10mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	nv10mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv10_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv10_miptree *mt;
+
+	mt = MALLOC(sizeof(struct nv10_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = screen;
+
+	nv10_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL,
+					   mt->total_size);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+	
+	return &mt->base;
+}
+
+static void
+nv10_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt)
+{
+	struct pipe_texture *mt = *pt;
+
+	*pt = NULL;
+	if (--mt->refcount <= 0) {
+		struct nv10_miptree *nv10mt = (struct nv10_miptree *)mt;
+		int l;
+
+		pipe_buffer_reference(screen, &nv10mt->buffer, NULL);
+		for (l = 0; l <= mt->last_level; l++) {
+			if (nv10mt->level[l].image_offset)
+				FREE(nv10mt->level[l].image_offset);
+		}
+		FREE(nv10mt);
+	}
+}
+
+static void
+nv10_miptree_update(struct pipe_context *pipe, struct pipe_texture *mt,
+		    uint face, uint levels)
+{
+}
+
+
+static struct pipe_surface *
+nv10_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv10_miptree *nv10mt = (struct nv10_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = ws->surface_alloc(ws);
+	if (!ps)
+		return NULL;
+	pipe_buffer_reference(screen, &ps->buffer, nv10mt->buffer);
+	ps->format = pt->format;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = nv10mt->level[level].pitch;
+	ps->refcount = 1;
+	ps->winsys = screen->winsys;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ps->offset = nv10mt->level[level].image_offset[face];
+	} else {
+		ps->offset = nv10mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv10_miptree_surface_release(struct pipe_screen *screen,
+			     struct pipe_surface **surface)
+{
+}
+
+void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv10_miptree_create;
+	pscreen->texture_release = nv10_miptree_release;
+	pscreen->get_tex_surface = nv10_miptree_surface_get;
+	pscreen->tex_surface_release = nv10_miptree_surface_release;
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
new file mode 100644
index 00000000000..62a8f6d89da
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -0,0 +1,240 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * Build post-transformation, post-clipping vertex buffers and element
+ * lists by hooking into the end of the primitive pipeline and
+ * manipulating the vertex_id field in the vertex headers.
+ *
+ * XXX: work in progress 
+ * 
+ * \author José Fonseca <[email protected]>
+ * \author Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_debug.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv10_context.h"
+#include "nv10_state.h"
+
+#include "draw/draw_vbuf.h"
+
+/**
+ * Primitive renderer for nv10.
+ */
+struct nv10_vbuf_render {
+	struct vbuf_render base;
+
+	struct nv10_context *nv10;   
+
+	/** Vertex buffer */
+	struct pipe_buffer* buffer;
+
+	/** Vertex size in bytes */
+	unsigned vertex_size;
+
+	/** Hardware primitive */
+	unsigned hwprim;
+};
+
+
+void nv10_vtxbuf_bind( struct nv10_context* nv10 )
+{
+	int i;
+	for(i = 0; i < 8; i++) {
+		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1);
+		OUT_RING(0/*nv10->vtxbuf*/);
+		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1);
+		OUT_RING(0/*XXX*/);
+	}
+}
+
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct nv10_vbuf_render *
+nv10_vbuf_render( struct vbuf_render *render )
+{
+	assert(render);
+	return (struct nv10_vbuf_render *)render;
+}
+
+
+static const struct vertex_info *
+nv10_vbuf_render_get_vertex_info( struct vbuf_render *render )
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	struct nv10_context *nv10 = nv10_render->nv10;
+
+	nv10_emit_hw_state(nv10);
+
+	return &nv10->vertex_info;
+}
+
+
+static void *
+nv10_vbuf_render_allocate_vertices( struct vbuf_render *render,
+		ushort vertex_size,
+		ushort nr_vertices )
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	struct nv10_context *nv10 = nv10_render->nv10;
+	struct pipe_winsys *winsys = nv10->pipe.winsys;
+	size_t size = (size_t)vertex_size * (size_t)nr_vertices;
+
+	assert(!nv10_render->buffer);
+	nv10_render->buffer = winsys->buffer_create(winsys, 64, PIPE_BUFFER_USAGE_VERTEX, size);
+
+	nv10->dirty |= NV10_NEW_VTXARRAYS;
+
+	return winsys->buffer_map(winsys, 
+			nv10_render->buffer, 
+			PIPE_BUFFER_USAGE_CPU_WRITE);
+}
+
+
+static void 
+nv10_vbuf_render_set_primitive( struct vbuf_render *render, 
+		unsigned prim )
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	nv10_render->hwprim = prim + 1;
+}
+
+
+static void 
+nv10_vbuf_render_draw( struct vbuf_render *render,
+		const ushort *indices,
+		uint nr_indices)
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	struct nv10_context *nv10 = nv10_render->nv10;
+	int push, i;
+
+	nv10_emit_hw_state(nv10);
+
+	BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+
+	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(nv10_render->hwprim);
+
+	if (nr_indices & 1) {
+		BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (indices[0]);
+		indices++; nr_indices--;
+	}
+
+	while (nr_indices) {
+		// XXX too big/small ? check the size
+		push = MIN2(nr_indices, 1200 * 2);
+
+		BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((indices[i+1] << 16) | indices[i]);
+
+		nr_indices -= push;
+		indices  += push;
+	}
+
+	BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (0);
+}
+
+
+static void
+nv10_vbuf_render_release_vertices( struct vbuf_render *render,
+		void *vertices, 
+		unsigned vertex_size,
+		unsigned vertices_used )
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	struct nv10_context *nv10 = nv10_render->nv10;
+	struct pipe_winsys *winsys = nv10->pipe.winsys;
+	struct pipe_screen *pscreen = &nv10->screen->pipe;
+
+	assert(nv10_render->buffer);
+	winsys->buffer_unmap(winsys, nv10_render->buffer);
+	pipe_buffer_reference(pscreen, &nv10_render->buffer, NULL);
+}
+
+
+static void
+nv10_vbuf_render_destroy( struct vbuf_render *render )
+{
+	struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
+	FREE(nv10_render);
+}
+
+
+/**
+ * Create a new primitive render.
+ */
+static struct vbuf_render *
+nv10_vbuf_render_create( struct nv10_context *nv10 )
+{
+	struct nv10_vbuf_render *nv10_render = CALLOC_STRUCT(nv10_vbuf_render);
+
+	nv10_render->nv10 = nv10;
+
+	nv10_render->base.max_vertex_buffer_bytes = 16*1024;
+	nv10_render->base.max_indices = 1024;
+	nv10_render->base.get_vertex_info = nv10_vbuf_render_get_vertex_info;
+	nv10_render->base.allocate_vertices = nv10_vbuf_render_allocate_vertices;
+	nv10_render->base.set_primitive = nv10_vbuf_render_set_primitive;
+	nv10_render->base.draw = nv10_vbuf_render_draw;
+	nv10_render->base.release_vertices = nv10_vbuf_render_release_vertices;
+	nv10_render->base.destroy = nv10_vbuf_render_destroy;
+
+	return &nv10_render->base;
+}
+
+
+/**
+ * Create a new primitive vbuf/render stage.
+ */
+struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 )
+{
+	struct vbuf_render *render;
+	struct draw_stage *stage;
+
+	render = nv10_vbuf_render_create(nv10);
+	if(!render)
+		return NULL;
+
+	stage = draw_vbuf_stage( nv10->draw, render );
+	if(!stage) {
+		render->destroy(render);
+		return NULL;
+	}
+
+	return stage;
+}
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
new file mode 100644
index 00000000000..27a9edf9bba
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -0,0 +1,208 @@
+#include "pipe/p_screen.h"
+
+#include "nv10_context.h"
+#include "nv10_screen.h"
+
+static const char *
+nv10_screen_get_name(struct pipe_screen *screen)
+{
+	struct nv10_screen *nv10screen = nv10_screen(screen);
+	struct nouveau_device *dev = nv10screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv10_screen_get_vendor(struct pipe_screen *screen)
+{
+	return "nouveau";
+}
+
+static int
+nv10_screen_get_param(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 2;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 0;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 0;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 1;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 12;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 12;
+	case NOUVEAU_CAP_HW_VTXBUF:
+	case NOUVEAU_CAP_HW_IDXBUF:
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv10_screen_get_paramf(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 2.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 4.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static boolean
+nv10_screen_is_format_supported(struct pipe_screen *screen,
+				enum pipe_format format,
+				enum pipe_texture_target target,
+				unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static void *
+nv10_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys *ws = screen->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, surface->buffer, flags);
+	if (!map)
+		return NULL;
+
+	return map + surface->offset;
+}
+
+static void
+nv10_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+
+	ws->buffer_unmap(ws, surface->buffer);
+}
+
+static void
+nv10_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv10_screen *screen = nv10_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->celsius);
+
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv10_screen *screen = CALLOC_STRUCT(nv10_screen);
+	unsigned celsius_class;
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	if (chipset>=0x20)
+		celsius_class=NV11TCL;
+	else if (chipset>=0x17)
+		celsius_class=NV17TCL;
+	else if (chipset>=0x11)
+		celsius_class=NV11TCL;
+	else
+		celsius_class=NV10TCL;
+
+	if (!celsius_class) {
+		NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset);
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, celsius_class, &screen->celsius);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv10_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv10_screen_destroy;
+
+	screen->pipe.get_name = nv10_screen_get_name;
+	screen->pipe.get_vendor = nv10_screen_get_vendor;
+	screen->pipe.get_param = nv10_screen_get_param;
+	screen->pipe.get_paramf = nv10_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv10_screen_is_format_supported;
+
+	screen->pipe.surface_map = nv10_surface_map;
+	screen->pipe.surface_unmap = nv10_surface_unmap;
+
+	nv10_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_screen.h b/src/gallium/drivers/nv10/nv10_screen.h
new file mode 100644
index 00000000000..3f8750a13f7
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_screen.h
@@ -0,0 +1,22 @@
+#ifndef __NV10_SCREEN_H__
+#define __NV10_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv10_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *celsius;
+	struct nouveau_notifier *sync;
+};
+
+static INLINE struct nv10_screen *
+nv10_screen(struct pipe_screen *screen)
+{
+	return (struct nv10_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c
new file mode 100644
index 00000000000..d2375aa2f64
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_state.c
@@ -0,0 +1,588 @@
+#include "draw/draw_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv10_context.h"
+#include "nv10_state.h"
+
+static void *
+nv10_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv10_blend_state *cb;
+
+	cb = MALLOC(sizeof(struct nv10_blend_state));
+
+	cb->b_enable = cso->blend_enable ? 1 : 0;
+	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_src_factor)));
+	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_dst_factor)));
+
+	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) |
+		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) |
+		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) |
+		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0));
+
+	cb->d_enable = cso->dither ? 1 : 0;
+
+	return (void *)cb;
+}
+
+static void
+nv10_blend_state_bind(struct pipe_context *pipe, void *blend)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->blend = (struct nv10_blend_state*)blend;
+
+	nv10->dirty |= NV10_NEW_BLEND;
+}
+
+static void
+nv10_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT;
+		break;
+	}
+
+	return ret >> NV10TCL_TX_FORMAT_WRAP_S_SHIFT;
+}
+
+static void *
+nv10_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv10_sampler_state *ps;
+	uint32_t filter = 0;
+
+	ps = MALLOC(sizeof(struct nv10_sampler_state));
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV10TCL_TX_FORMAT_WRAP_S_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV10TCL_TX_FORMAT_WRAP_T_SHIFT));
+
+	ps->en = 0;
+	if (cso->max_anisotropy > 1.0) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+/*		if (cso->max_anisotropy >= 16.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_2X;
+		}*/
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV10TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV10TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+/*	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}*/
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+static void
+nv10_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv10->tex_sampler[unit] = sampler[unit];
+		nv10->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void
+nv10_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void
+nv10_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv10->tex_miptree[unit] = (struct nv10_miptree *)miptree[unit];
+		nv10->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void *
+nv10_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv10_rasterizer_state *rs;
+	int i;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	offset_cw/ccw -nohw
+	 * 	scissor
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 * 	offset_units / offset_scale
+	 */
+	rs = MALLOC(sizeof(struct nv10_rasterizer_state));
+
+	rs->templ = cso;
+	
+	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01;
+
+	rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff;
+	rs->line_smooth_en = cso->line_smooth ? 1 : 0;
+
+	rs->point_size = *(uint32_t*)&cso->point_size;
+
+	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0;
+
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		rs->front_face = NV10TCL_FRONT_FACE_CCW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw);
+	} else {
+		rs->front_face = NV10TCL_FRONT_FACE_CW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw);
+	}
+
+	switch (cso->cull_mode) {
+	case PIPE_WINDING_CCW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CCW)
+			rs->cull_face    = NV10TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV10TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_CW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CW)
+			rs->cull_face    = NV10TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV10TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_BOTH:
+		rs->cull_face_en = 1;
+		rs->cull_face    = NV10TCL_CULL_FACE_FRONT_AND_BACK;
+		break;
+	case PIPE_WINDING_NONE:
+	default:
+		rs->cull_face_en = 0;
+		rs->cull_face    = 0;
+		break;
+	}
+
+	if (cso->point_sprite) {
+		rs->point_sprite = (1 << 0);
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				rs->point_sprite |= (1 << (8 + i));
+		}
+	} else {
+		rs->point_sprite = 0;
+	}
+
+	return (void *)rs;
+}
+
+static void
+nv10_rasterizer_state_bind(struct pipe_context *pipe, void *rast)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->rast = (struct nv10_rasterizer_state*)rast;
+
+	draw_set_rasterizer_state(nv10->draw, (nv10->rast ? nv10->rast->templ : NULL));
+
+	nv10->dirty |= NV10_NEW_RAST;
+}
+
+static void
+nv10_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void *
+nv10_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv10_depth_stencil_alpha_state *hw;
+
+	hw = MALLOC(sizeof(struct nv10_depth_stencil_alpha_state));
+
+	hw->depth.func		= nvgl_comparison_op(cso->depth.func);
+	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0;
+	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0;
+
+	hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0;
+	hw->stencil.wmask = cso->stencil[0].write_mask;
+	hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func);
+	hw->stencil.ref	= cso->stencil[0].ref_value;
+	hw->stencil.vmask = cso->stencil[0].value_mask;
+	hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op);
+	hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op);
+	hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op);
+
+	hw->alpha.enabled = cso->alpha.enabled ? 1 : 0;
+	hw->alpha.func = nvgl_comparison_op(cso->alpha.func);
+	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref);
+
+	return (void *)hw;
+}
+
+static void
+nv10_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->dsa = (struct nv10_depth_stencil_alpha_state*)dsa;
+
+	nv10->dirty |= NV10_NEW_DSA;
+}
+
+static void
+nv10_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void *
+nv10_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *templ)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	return draw_create_vertex_shader(nv10->draw, templ);
+}
+
+static void
+nv10_vp_state_bind(struct pipe_context *pipe, void *shader)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	draw_bind_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader);
+
+	nv10->dirty |= NV10_NEW_VERTPROG;
+}
+
+static void
+nv10_vp_state_delete(struct pipe_context *pipe, void *shader)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	draw_delete_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader);
+}
+
+static void *
+nv10_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv10_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nv10_fragment_program));
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	
+	tgsi_scan_shader(cso->tokens, &fp->info);
+
+	return (void *)fp;
+}
+
+static void
+nv10_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nv10_fragment_program *fp = hwcso;
+
+	nv10->fragprog.current = fp;
+	nv10->dirty |= NV10_NEW_FRAGPROG;
+}
+
+static void
+nv10_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nv10_fragment_program *fp = hwcso;
+
+	nv10_fragprog_destroy(nv10, fp);
+	FREE((void*)fp->pipe.tokens);
+	FREE(fp);
+}
+
+static void
+nv10_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->blend_color = (struct pipe_blend_color*)bcol;
+
+	nv10->dirty |= NV10_NEW_BLENDCOL;
+}
+
+static void
+nv10_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	draw_set_clip_state(nv10->draw, clip);
+}
+
+static void
+nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+
+	assert(shader < PIPE_SHADER_TYPES);
+	assert(index == 0);
+
+	if (buf) {
+		void *mapped;
+		if (buf->size && (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+		{
+			memcpy(nv10->constbuf[shader], mapped, buf->size);
+			nv10->constbuf_nr[shader] =
+				buf->size / (4 * sizeof(float));
+			ws->buffer_unmap(ws, buf->buffer);
+		}
+	}
+}
+
+static void
+nv10_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->framebuffer = (struct pipe_framebuffer_state*)fb;
+
+	nv10->dirty |= NV10_NEW_FRAMEBUFFER;
+}
+
+static void
+nv10_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	NOUVEAU_ERR("line stipple hahaha\n");
+}
+
+static void
+nv10_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->scissor = (struct pipe_scissor_state*)s;
+
+	nv10->dirty |= NV10_NEW_SCISSOR;
+}
+
+static void
+nv10_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	nv10->viewport = (struct pipe_viewport_state*)vpt;
+
+	draw_set_viewport_state(nv10->draw, nv10->viewport);
+
+	nv10->dirty |= NV10_NEW_VIEWPORT;
+}
+
+static void
+nv10_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	memcpy(nv10->vtxbuf, vb, sizeof(*vb) * count);
+	nv10->dirty |= NV10_NEW_VTXARRAYS;
+
+	draw_set_vertex_buffers(nv10->draw, count, vb);
+}
+
+static void
+nv10_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+
+	memcpy(nv10->vtxelt, ve, sizeof(*ve) * count);
+	nv10->dirty |= NV10_NEW_VTXARRAYS;
+
+	draw_set_vertex_elements(nv10->draw, count, ve);
+}
+
+void
+nv10_init_state_functions(struct nv10_context *nv10)
+{
+	nv10->pipe.create_blend_state = nv10_blend_state_create;
+	nv10->pipe.bind_blend_state = nv10_blend_state_bind;
+	nv10->pipe.delete_blend_state = nv10_blend_state_delete;
+
+	nv10->pipe.create_sampler_state = nv10_sampler_state_create;
+	nv10->pipe.bind_sampler_states = nv10_sampler_state_bind;
+	nv10->pipe.delete_sampler_state = nv10_sampler_state_delete;
+	nv10->pipe.set_sampler_textures = nv10_set_sampler_texture;
+
+	nv10->pipe.create_rasterizer_state = nv10_rasterizer_state_create;
+	nv10->pipe.bind_rasterizer_state = nv10_rasterizer_state_bind;
+	nv10->pipe.delete_rasterizer_state = nv10_rasterizer_state_delete;
+
+	nv10->pipe.create_depth_stencil_alpha_state =
+		nv10_depth_stencil_alpha_state_create;
+	nv10->pipe.bind_depth_stencil_alpha_state =
+		nv10_depth_stencil_alpha_state_bind;
+	nv10->pipe.delete_depth_stencil_alpha_state =
+		nv10_depth_stencil_alpha_state_delete;
+
+	nv10->pipe.create_vs_state = nv10_vp_state_create;
+	nv10->pipe.bind_vs_state = nv10_vp_state_bind;
+	nv10->pipe.delete_vs_state = nv10_vp_state_delete;
+
+	nv10->pipe.create_fs_state = nv10_fp_state_create;
+	nv10->pipe.bind_fs_state = nv10_fp_state_bind;
+	nv10->pipe.delete_fs_state = nv10_fp_state_delete;
+
+	nv10->pipe.set_blend_color = nv10_set_blend_color;
+	nv10->pipe.set_clip_state = nv10_set_clip_state;
+	nv10->pipe.set_constant_buffer = nv10_set_constant_buffer;
+	nv10->pipe.set_framebuffer_state = nv10_set_framebuffer_state;
+	nv10->pipe.set_polygon_stipple = nv10_set_polygon_stipple;
+	nv10->pipe.set_scissor_state = nv10_set_scissor_state;
+	nv10->pipe.set_viewport_state = nv10_set_viewport_state;
+
+	nv10->pipe.set_vertex_buffers = nv10_set_vertex_buffers;
+	nv10->pipe.set_vertex_elements = nv10_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_state.h b/src/gallium/drivers/nv10/nv10_state.h
new file mode 100644
index 00000000000..3a3fd0d4f4f
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_state.h
@@ -0,0 +1,139 @@
+#ifndef __NV10_STATE_H__
+#define __NV10_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv10_blend_state {
+	uint32_t b_enable;
+	uint32_t b_srcfunc;
+	uint32_t b_dstfunc;
+
+	uint32_t c_mask;
+
+	uint32_t d_enable;
+};
+
+struct nv10_sampler_state {
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+struct nv10_rasterizer_state {
+	uint32_t shade_model;
+
+	uint32_t line_width;
+	uint32_t line_smooth_en;
+
+	uint32_t point_size;
+
+	uint32_t poly_smooth_en;
+	
+	uint32_t poly_mode_front;
+	uint32_t poly_mode_back;
+
+	uint32_t front_face;
+	uint32_t cull_face;
+	uint32_t cull_face_en;
+
+	uint32_t point_sprite;
+
+	const struct pipe_rasterizer_state *templ;
+};
+
+struct nv10_vertex_program_exec {
+	uint32_t data[4];
+	boolean has_branch_offset;
+	int const_index;
+};
+
+struct nv10_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4];
+};
+
+struct nv10_vertex_program {
+	const struct pipe_shader_state *pipe;
+
+	boolean translated;
+	struct nv10_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv10_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+};
+
+struct nv10_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv10_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	boolean on_hw;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv10_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	uint32_t fp_reg_control;
+};
+
+
+struct nv10_depth_stencil_alpha_state {
+	struct {
+		uint32_t func;
+		uint32_t write_enable;
+		uint32_t test_enable;
+	} depth;
+
+	struct {
+		uint32_t enable;
+		uint32_t wmask;
+		uint32_t func;
+		uint32_t ref;
+		uint32_t vmask;
+		uint32_t fail;
+		uint32_t zfail;
+		uint32_t zpass;
+	} stencil;
+
+	struct {
+		uint32_t enabled;
+		uint32_t func;
+		uint32_t ref;
+	} alpha;
+};
+
+struct nv10_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c
new file mode 100644
index 00000000000..46c7e1d7536
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_state_emit.c
@@ -0,0 +1,303 @@
+#include "nv10_context.h"
+#include "nv10_state.h"
+
+static void nv10_state_emit_blend(struct nv10_context* nv10)
+{
+	struct nv10_blend_state *b = nv10->blend;
+
+	BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1);
+	OUT_RING  (b->d_enable);
+
+	BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
+	OUT_RING  (b->b_enable);
+	OUT_RING  (b->b_srcfunc);
+	OUT_RING  (b->b_dstfunc);
+
+	BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (b->c_mask);
+}
+
+static void nv10_state_emit_blend_color(struct nv10_context* nv10)
+{
+	struct pipe_blend_color *c = nv10->blend_color;
+
+	BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1);
+	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+		   (float_to_ubyte(c->color[0]) << 16)|
+		   (float_to_ubyte(c->color[1]) << 8) |
+		   (float_to_ubyte(c->color[2]) << 0));
+}
+
+static void nv10_state_emit_rast(struct nv10_context* nv10)
+{
+	struct nv10_rasterizer_state *r = nv10->rast;
+
+	BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2);
+	OUT_RING  (r->shade_model);
+	OUT_RING  (r->line_width);
+
+
+	BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (r->point_size);
+
+	BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (r->poly_mode_front);
+	OUT_RING  (r->poly_mode_back);
+
+
+	BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (r->cull_face);
+	OUT_RING  (r->front_face);
+
+	BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (r->line_smooth_en);
+	OUT_RING  (r->poly_smooth_en);
+
+	BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (r->cull_face_en);
+}
+
+static void nv10_state_emit_dsa(struct nv10_context* nv10)
+{
+	struct nv10_depth_stencil_alpha_state *d = nv10->dsa;
+
+	BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING (d->depth.func);
+
+	BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (d->depth.write_enable);
+
+	BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (d->depth.test_enable);
+
+#if 0
+	BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1);
+	OUT_RING (d->stencil.enable);
+	BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7);
+	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+#endif
+
+	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (d->alpha.enabled);
+
+	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (d->alpha.func);
+
+	BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (d->alpha.ref);
+}
+
+static void nv10_state_emit_viewport(struct nv10_context* nv10)
+{
+}
+
+static void nv10_state_emit_scissor(struct nv10_context* nv10)
+{
+	// XXX this is so not working
+/*	struct pipe_scissor_state *s = nv10->scissor;
+	BEGIN_RING(celsius, NV10TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
+	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
+}
+
+static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
+{
+	struct pipe_framebuffer_state* fb = nv10->framebuffer;
+	struct pipe_surface *rt, *zeta = NULL;
+	uint32_t rt_format, w, h;
+	int colour_format = 0, zeta_format = 0;
+
+	w = fb->cbufs[0]->width;
+	h = fb->cbufs[0]->height;
+	colour_format = fb->cbufs[0]->format;
+	rt = fb->cbufs[0];
+
+	if (fb->zsbuf) {
+		if (colour_format) {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		} else {
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+		}
+
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR;
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV10TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	if (zeta) {
+		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (rt->stride | (zeta->stride << 16));
+	} else {
+		BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (rt->stride | (rt->stride << 16));
+	}
+
+	nv10->rt[0] = rt->buffer;
+
+	if (zeta_format)
+	{
+		nv10->zeta = zeta->buffer;
+	}
+
+	BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3);
+	OUT_RING  ((w << 16) | 0);
+	OUT_RING  ((h << 16) | 0);
+	OUT_RING  (rt_format);
+	BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (((w - 1) << 16) | 0 | 0x08000800);
+	OUT_RING  (((h - 1) << 16) | 0 | 0x08000800);
+}
+
+static void nv10_vertex_layout(struct nv10_context *nv10)
+{
+	struct nv10_fragment_program *fp = nv10->fragprog.current;
+	uint32_t src = 0;
+	int i;
+	struct vertex_info vinfo;
+
+	memset(&vinfo, 0, sizeof(vinfo));
+
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		switch (fp->info.input_semantic_name[i]) {
+			case TGSI_SEMANTIC_POSITION:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			case TGSI_SEMANTIC_COLOR:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			default:
+			case TGSI_SEMANTIC_GENERIC:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+			case TGSI_SEMANTIC_FOG:
+				draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+		}
+	}
+	draw_compute_vertex_size(&vinfo);
+}
+
+void
+nv10_emit_hw_state(struct nv10_context *nv10)
+{
+	int i;
+
+	if (nv10->dirty & NV10_NEW_VERTPROG) {
+		//nv10_vertprog_bind(nv10, nv10->vertprog.current);
+		nv10->dirty &= ~NV10_NEW_VERTPROG;
+	}
+
+	if (nv10->dirty & NV10_NEW_FRAGPROG) {
+		nv10_fragprog_bind(nv10, nv10->fragprog.current);
+		/*XXX: clear NV10_NEW_FRAGPROG if no new program uploaded */
+		nv10->dirty_samplers |= (1<<10);
+		nv10->dirty_samplers = 0;
+	}
+
+	if (nv10->dirty_samplers || (nv10->dirty & NV10_NEW_FRAGPROG)) {
+		nv10_fragtex_bind(nv10);
+		nv10->dirty &= ~NV10_NEW_FRAGPROG;
+	}
+
+	if (nv10->dirty & NV10_NEW_VTXARRAYS) {
+		nv10->dirty &= ~NV10_NEW_VTXARRAYS;
+		nv10_vertex_layout(nv10);
+		nv10_vtxbuf_bind(nv10);
+	}
+
+	if (nv10->dirty & NV10_NEW_BLEND) {
+		nv10->dirty &= ~NV10_NEW_BLEND;
+		nv10_state_emit_blend(nv10);
+	}
+
+	if (nv10->dirty & NV10_NEW_BLENDCOL) {
+		nv10->dirty &= ~NV10_NEW_BLENDCOL;
+		nv10_state_emit_blend_color(nv10);
+	}
+
+	if (nv10->dirty & NV10_NEW_RAST) {
+		nv10->dirty &= ~NV10_NEW_RAST;
+		nv10_state_emit_rast(nv10);
+	}
+
+	if (nv10->dirty & NV10_NEW_DSA) {
+		nv10->dirty &= ~NV10_NEW_DSA;
+		nv10_state_emit_dsa(nv10);
+	}
+
+ 	if (nv10->dirty & NV10_NEW_VIEWPORT) {
+		nv10->dirty &= ~NV10_NEW_VIEWPORT;
+		nv10_state_emit_viewport(nv10);
+	}
+
+ 	if (nv10->dirty & NV10_NEW_SCISSOR) {
+		nv10->dirty &= ~NV10_NEW_SCISSOR;
+		nv10_state_emit_scissor(nv10);
+	}
+
+ 	if (nv10->dirty & NV10_NEW_FRAMEBUFFER) {
+		nv10->dirty &= ~NV10_NEW_FRAMEBUFFER;
+		nv10_state_emit_framebuffer(nv10);
+	}
+
+	/* Emit relocs for every referenced buffer.
+	 * This is to ensure the bufmgr has an accurate idea of how
+	 * the buffer is used.  This isn't very efficient, but we don't
+	 * seem to take a significant performance hit.  Will be improved
+	 * at some point.  Vertex arrays are emitted by nv10_vbo.c
+	 */
+
+	/* Render target */
+// XXX figre out who's who for NV10TCL_DMA_* and fill accordingly
+//	BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1);
+//	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	if (nv10->zeta) {
+// XXX
+//		BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1);
+//		OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		/* XXX for when we allocate LMA on nv17 */
+/*		BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(nv10->zeta + lma_offset);*/
+	}
+
+	/* Vertex buffer */
+	BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	/* Texture images */
+	for (i = 0; i < 2; i++) {
+		if (!(nv10->fp_samplers & (1 << i)))
+			continue;
+		BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			   NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0,
+			   NV10TCL_TX_FORMAT_DMA1);
+	}
+}
+
diff --git a/src/gallium/drivers/nv10/nv10_surface.c b/src/gallium/drivers/nv10/nv10_surface.c
new file mode 100644
index 00000000000..875e4c58589
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_surface.c
@@ -0,0 +1,64 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv10_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+#include "util/u_tile.h"
+
+static void
+nv10_surface_copy(struct pipe_context *pipe, unsigned do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nouveau_winsys *nvws = nv10->nvws;
+
+	nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+			   width, height);
+}
+
+static void
+nv10_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv10_context *nv10 = nv10_context(pipe);
+	struct nouveau_winsys *nvws = nv10->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+void
+nv10_init_surface_functions(struct nv10_context *nv10)
+{
+	nv10->pipe.surface_copy = nv10_surface_copy;
+	nv10->pipe.surface_fill = nv10_surface_fill;
+}
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
new file mode 100644
index 00000000000..d0e788ac036
--- /dev/null
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -0,0 +1,77 @@
+#include "draw/draw_context.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv10_context.h"
+#include "nv10_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+boolean nv10_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count)
+{
+	struct nv10_context *nv10 = nv10_context( pipe );
+	struct draw_context *draw = nv10->draw;
+	unsigned i;
+
+	nv10_emit_hw_state(nv10);
+
+	/*
+	 * Map vertex buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv10->vtxbuf[i].buffer) {
+			void *buf
+				= pipe->winsys->buffer_map(pipe->winsys,
+						nv10->vtxbuf[i].buffer,
+						PIPE_BUFFER_USAGE_CPU_READ);
+			draw_set_mapped_vertex_buffer(draw, i, buf);
+		}
+	}
+	/* Map index buffer, if present */
+	if (indexBuffer) {
+		void *mapped_indexes
+			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+					PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
+	}
+	else {
+		/* no index/element buffer */
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	draw_set_mapped_constant_buffer(draw,
+					nv10->constbuf[PIPE_SHADER_VERTEX],
+					nv10->constbuf_nr[PIPE_SHADER_VERTEX]);
+
+	/* draw! */
+	draw_arrays(nv10->draw, prim, start, count);
+
+	/*
+	 * unmap vertex/index buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv10->vtxbuf[i].buffer) {
+			pipe->winsys->buffer_unmap(pipe->winsys, nv10->vtxbuf[i].buffer);
+			draw_set_mapped_vertex_buffer(draw, i, NULL);
+		}
+	}
+	if (indexBuffer) {
+		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	return TRUE;
+}
+
+boolean nv10_draw_arrays( struct pipe_context *pipe,
+				 unsigned prim, unsigned start, unsigned count)
+{
+	return nv10_draw_elements(pipe, NULL, 0, prim, start, count);
+}
+
+
+
diff --git a/src/gallium/drivers/nv20/Makefile b/src/gallium/drivers/nv20/Makefile
new file mode 100644
index 00000000000..d777fd3d8b4
--- /dev/null
+++ b/src/gallium/drivers/nv20/Makefile
@@ -0,0 +1,29 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv20
+
+DRIVER_SOURCES = \
+	nv20_clear.c \
+	nv20_context.c \
+	nv20_fragprog.c \
+	nv20_fragtex.c \
+	nv20_miptree.c \
+	nv20_prim_vbuf.c \
+	nv20_screen.c \
+	nv20_state.c \
+	nv20_state_emit.c \
+	nv20_surface.c \
+	nv20_vbo.c
+#	nv20_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv20/nv20_clear.c b/src/gallium/drivers/nv20/nv20_clear.c
new file mode 100644
index 00000000000..81b6f3e78ac
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_clear.c
@@ -0,0 +1,12 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv20_context.h"
+
+void
+nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+}
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
new file mode 100644
index 00000000000..2af5b0203ed
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -0,0 +1,296 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv20_context.h"
+#include "nv20_screen.h"
+
+static void
+nv20_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	draw_flush(nv20->draw);
+
+	FIRE_RING(fence);
+}
+
+static void
+nv20_destroy(struct pipe_context *pipe)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	if (nv20->draw)
+		draw_destroy(nv20->draw);
+
+	FREE(nv20);
+}
+
+static void nv20_init_hwctx(struct nv20_context *nv20)
+{
+	struct nv20_screen *screen = nv20->screen;
+	struct nouveau_winsys *nvws = screen->nvws;
+	int i;
+	float projectionmatrix[16];
+
+	BEGIN_RING(kelvin, NV10TCL_DMA_NOTIFY, 1);
+	OUT_RING  (screen->sync->handle);
+	BEGIN_RING(kelvin, NV10TCL_DMA_IN_MEMORY0, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->gart->handle);
+	BEGIN_RING(kelvin, NV10TCL_DMA_IN_MEMORY2, 2);
+	OUT_RING  (nvws->channel->vram->handle);
+	OUT_RING  (nvws->channel->vram->handle);
+
+	BEGIN_RING(kelvin, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(kelvin, NV10TCL_RT_HORIZ, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+
+	BEGIN_RING(kelvin, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+	OUT_RING  ((0x7ff<<16)|0x800);
+	BEGIN_RING(kelvin, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
+	OUT_RING  ((0x7ff<<16)|0x800);
+
+	for (i=1;i<8;i++) {
+		BEGIN_RING(kelvin, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		OUT_RING  (0);
+		BEGIN_RING(kelvin, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
+		OUT_RING  (0);
+	}
+
+	BEGIN_RING(kelvin, 0x290, 1);
+	OUT_RING  ((0x10<<16)|1);
+	BEGIN_RING(kelvin, 0x3f4, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(kelvin, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	if (nv20->screen->kelvin->grclass != NV10TCL) {
+		/* For nv11, nv17 */
+		BEGIN_RING(kelvin, 0x120, 3);
+		OUT_RING  (0);
+		OUT_RING  (1);
+		OUT_RING  (2);
+
+		BEGIN_RING(kelvin, NV10TCL_NOP, 1);
+		OUT_RING  (0);
+	}
+
+	BEGIN_RING(kelvin, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	/* Set state */
+	BEGIN_RING(kelvin, NV10TCL_FOG_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_ALPHA_FUNC_FUNC, 2);
+	OUT_RING  (0x207);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(0), 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+
+	BEGIN_RING(kelvin, NV10TCL_RC_IN_ALPHA(0), 12);
+	OUT_RING  (0x30141010);
+	OUT_RING  (0);
+	OUT_RING  (0x20040000);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0x00000c00);
+	OUT_RING  (0);
+	OUT_RING  (0x00000c00);
+	OUT_RING  (0x18000000);
+	OUT_RING  (0x300e0300);
+	OUT_RING  (0x0c091c80);
+
+	BEGIN_RING(kelvin, NV10TCL_BLEND_FUNC_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_DITHER_ENABLE, 2);
+	OUT_RING  (1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_BLEND_FUNC_SRC, 4);
+	OUT_RING  (1);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0x8006);
+	BEGIN_RING(kelvin, NV10TCL_STENCIL_MASK, 8);
+	OUT_RING  (0xff);
+	OUT_RING  (0x207);
+	OUT_RING  (0);
+	OUT_RING  (0xff);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1e00);
+	OUT_RING  (0x1d01);
+	BEGIN_RING(kelvin, NV10TCL_NORMALIZE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_FOG_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_LIGHT_MODEL, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_COLOR_CONTROL, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_ENABLED_LIGHTS, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING  (0x201);
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (8);
+	BEGIN_RING(kelvin, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_LINE_WIDTH, 1);
+	OUT_RING  (8);
+	BEGIN_RING(kelvin, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (0x1b02);
+	OUT_RING  (0x1b02);
+	BEGIN_RING(kelvin, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (0x405);
+	OUT_RING  (0x901);
+	BEGIN_RING(kelvin, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_CLIP_PLANE_ENABLE(0), 8);
+	for (i=0;i<8;i++) {
+		OUT_RING  (0);
+	}
+	BEGIN_RING(kelvin, NV10TCL_FOG_EQUATION_CONSTANT, 3);
+	OUT_RING  (0x3fc00000);	/* -1.50 */
+	OUT_RING  (0xbdb8aa0a);	/* -0.09 */
+	OUT_RING  (0);		/*  0.00 */
+
+	BEGIN_RING(kelvin, NV10TCL_NOP, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(kelvin, NV10TCL_FOG_MODE, 2);
+	OUT_RING  (0x802);
+	OUT_RING  (2);
+	/* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when
+	 * using texturing, except when using the texture matrix
+	 */
+	BEGIN_RING(kelvin, NV10TCL_VIEW_MATRIX_ENABLE, 1);
+	OUT_RING  (6);
+	BEGIN_RING(kelvin, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (0x01010101);
+
+	/* Set vertex component */
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_COL_4F_R, 4);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_COL2_3F_R, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_NOR_3F_X, 3);
+	OUT_RING  (0);
+	OUT_RING  (0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_TX0_4F_S, 4);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_TX1_4F_S, 4);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (0.0);
+	OUT_RINGf (1.0);
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_FOG_1F, 1);
+	OUT_RINGf (0.0);
+	BEGIN_RING(kelvin, NV10TCL_EDGEFLAG_ENABLE, 1);
+	OUT_RING  (1);
+
+	memset(projectionmatrix, 0, sizeof(projectionmatrix));
+	BEGIN_RING(kelvin, NV10TCL_PROJECTION_MATRIX(0), 16);
+	projectionmatrix[0*4+0] = 1.0;
+	projectionmatrix[1*4+1] = 1.0;
+	projectionmatrix[2*4+2] = 1.0;
+	projectionmatrix[3*4+3] = 1.0;
+	for (i=0;i<16;i++) {
+		OUT_RINGf  (projectionmatrix[i]);
+	}
+
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_RANGE_NEAR, 2);
+	OUT_RING  (0.0);
+	OUT_RINGf  (16777216.0);
+
+	BEGIN_RING(kelvin, NV10TCL_VIEWPORT_SCALE_X, 4);
+	OUT_RINGf  (-2048.0);
+	OUT_RINGf  (-2048.0);
+	OUT_RINGf  (16777215.0 * 0.5);
+	OUT_RING  (0);
+
+	FIRE_RING (NULL);
+}
+
+static void
+nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+}
+
+struct pipe_context *
+nv20_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct nv20_screen *screen = nv20_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv20_context *nv20;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nv20 = CALLOC(1, sizeof(struct nv20_context));
+	if (!nv20)
+		return NULL;
+	nv20->screen = screen;
+	nv20->pctx_id = pctx_id;
+
+	nv20->nvws = nvws;
+
+	nv20->pipe.winsys = ws;
+	nv20->pipe.screen = pscreen;
+	nv20->pipe.destroy = nv20_destroy;
+	nv20->pipe.set_edgeflags = nv20_set_edgeflags;
+	nv20->pipe.draw_arrays = nv20_draw_arrays;
+	nv20->pipe.draw_elements = nv20_draw_elements;
+	nv20->pipe.clear = nv20_clear;
+	nv20->pipe.flush = nv20_flush;
+
+	nv20_init_surface_functions(nv20);
+	nv20_init_state_functions(nv20);
+
+	nv20->draw = draw_create();
+	assert(nv20->draw);
+	draw_set_rasterize_stage(nv20->draw, nv20_draw_vbuf_stage(nv20));
+
+	nv20_init_hwctx(nv20);
+
+	return &nv20->pipe;
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h
new file mode 100644
index 00000000000..8ad926db20a
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_context.h
@@ -0,0 +1,153 @@
+#ifndef __NV20_CONTEXT_H__
+#define __NV20_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv20_screen *ctx = nv20->screen
+#include "nouveau/nouveau_push.h"
+
+#include "nv20_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+#define NV20_NEW_VERTPROG	(1 << 0)
+#define NV20_NEW_FRAGPROG	(1 << 1)
+#define NV20_NEW_VTXARRAYS	(1 << 2)
+#define NV20_NEW_BLEND		(1 << 3)
+#define NV20_NEW_BLENDCOL	(1 << 4)
+#define NV20_NEW_RAST 		(1 << 5)
+#define NV20_NEW_DSA  		(1 << 6)
+#define NV20_NEW_VIEWPORT	(1 << 7)
+#define NV20_NEW_SCISSOR	(1 << 8)
+#define NV20_NEW_FRAMEBUFFER	(1 << 9)
+
+#include "nv20_screen.h"
+
+struct nv20_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv20_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	uint32_t dirty;
+
+	struct nv20_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv20_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned dirty_samplers;
+	unsigned fp_samplers;
+	unsigned vp_samplers;
+
+	uint32_t rt_enable;
+	struct pipe_buffer *rt[4];
+	struct pipe_buffer *zeta;
+	uint32_t lma_offset;
+
+	struct nv20_blend_state *blend;
+	struct pipe_blend_color *blend_color;
+	struct nv20_rasterizer_state *rast;
+	struct nv20_depth_stencil_alpha_state *dsa;
+	struct pipe_viewport_state *viewport;
+	struct pipe_scissor_state *scissor;
+	struct pipe_framebuffer_state *framebuffer;
+
+	//struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	float *constbuf[PIPE_SHADER_TYPES][32][4];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+
+	struct vertex_info vertex_info;
+
+	struct {
+		struct pipe_buffer *buffer;
+		uint32_t format;
+	} tex[2];
+
+	unsigned vb_enable;
+	struct {
+		struct pipe_buffer *buffer;
+		unsigned delta;
+	} vb[16];
+
+/*	struct {
+	
+		struct nouveau_resource *exec_heap;
+		struct nouveau_resource *data_heap;
+
+		struct nv20_vertex_program *active;
+
+		struct nv20_vertex_program *current;
+	} vertprog;
+*/
+	struct {
+		struct nv20_fragment_program *active;
+
+		struct nv20_fragment_program *current;
+		struct pipe_buffer *constant_buf;
+	} fragprog;
+
+	struct pipe_vertex_buffer  vtxbuf[PIPE_MAX_ATTRIBS];
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+};
+
+static INLINE struct nv20_context *
+nv20_context(struct pipe_context *pipe)
+{
+	return (struct nv20_context *)pipe;
+}
+
+extern void nv20_init_state_functions(struct nv20_context *nv20);
+extern void nv20_init_surface_functions(struct nv20_context *nv20);
+
+extern void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv20_clear.c */
+extern void nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+/* nv20_draw.c */
+extern struct draw_stage *nv20_draw_render_stage(struct nv20_context *nv20);
+
+/* nv20_fragprog.c */
+extern void nv20_fragprog_bind(struct nv20_context *,
+			       struct nv20_fragment_program *);
+extern void nv20_fragprog_destroy(struct nv20_context *,
+				  struct nv20_fragment_program *);
+
+/* nv20_fragtex.c */
+extern void nv20_fragtex_bind(struct nv20_context *);
+
+/* nv20_prim_vbuf.c */
+struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 );
+extern void nv20_vtxbuf_bind(struct nv20_context* nv20);
+
+/* nv20_state.c and friends */
+extern void nv20_emit_hw_state(struct nv20_context *nv20);
+extern void nv20_state_tex_update(struct nv20_context *nv20);
+
+/* nv20_vbo.c */
+extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv20_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count);
+
+
+#endif
diff --git a/src/gallium/drivers/nv20/nv20_fragprog.c b/src/gallium/drivers/nv20/nv20_fragprog.c
new file mode 100644
index 00000000000..4f496369dd3
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_fragprog.c
@@ -0,0 +1,21 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv20_context.h"
+
+void
+nv20_fragprog_bind(struct nv20_context *nv20, struct nv20_fragment_program *fp)
+{
+}
+
+void
+nv20_fragprog_destroy(struct nv20_context *nv20,
+		      struct nv20_fragment_program *fp)
+{
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c
new file mode 100644
index 00000000000..77c34897e2c
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_fragtex.c
@@ -0,0 +1,149 @@
+#include "nv20_context.h"
+
+static INLINE int log2i(int i)
+{
+	int r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+#define _(m,tf)                                                                \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV10TCL_TX_FORMAT_FORMAT_##tf,                                               \
+}
+
+struct nv20_texture_format {
+	boolean defined;
+	uint	pipe;
+	int     format;
+};
+
+static struct nv20_texture_format
+nv20_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8),
+	_(A1R5G5B5_UNORM, A1R5G5B5),
+	_(A4R4G4B4_UNORM, A4R4G4B4),
+	_(L8_UNORM      , L8      ),
+	_(A8_UNORM      , A8      ),
+	_(A8L8_UNORM    , A8L8    ),
+//	_(RGB_DXT1      , DXT1,   ),
+//	_(RGBA_DXT1     , DXT1,   ),
+//	_(RGBA_DXT3     , DXT3,   ),
+//	_(RGBA_DXT5     , DXT5,   ),
+	{},
+};
+
+static struct nv20_texture_format *
+nv20_fragtex_format(uint pipe_format)
+{
+	struct nv20_texture_format *tf = nv20_texture_formats;
+
+	while (tf->defined) {
+		if (tf->pipe == pipe_format)
+			return tf;
+		tf++;
+	}
+
+	return NULL;
+}
+
+
+static void
+nv20_fragtex_build(struct nv20_context *nv20, int unit)
+{
+#if 0
+	struct nv20_sampler_state *ps = nv20->tex_sampler[unit];
+	struct nv20_miptree *nv20mt = nv20->tex_miptree[unit];
+	struct pipe_texture *pt = &nv20mt->base;
+	struct nv20_texture_format *tf;
+	uint32_t txf, txs, txp;
+
+	tf = nv20_fragtex_format(pt->format);
+	if (!tf || !tf->defined) {
+		NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format);
+		return;
+	}
+
+	txf  = tf->format << 8;
+	txf |= (pt->last_level + 1) << 16;
+	txf |= log2i(pt->width[0]) << 20;
+	txf |= log2i(pt->height[0]) << 24;
+	txf |= log2i(pt->depth[0]) << 28;
+	txf |= 8;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV10TCL_TX_FORMAT_CUBE_MAP;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= (2<<4);
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= (1<<4);
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return;
+	}
+
+	BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8);
+	OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+	OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+	OUT_RING  (ps->wrap);
+	OUT_RING  (0x40000000); /* enable */
+	OUT_RING  (txs);
+	OUT_RING  (ps->filt | 0x2000 /* magic */);
+	OUT_RING  ((pt->width[0] << 16) | pt->height[0]);
+	OUT_RING  (ps->bcol);
+#endif
+}
+
+void
+nv20_fragtex_bind(struct nv20_context *nv20)
+{
+#if 0
+	struct nv20_fragment_program *fp = nv20->fragprog.active;
+	unsigned samplers, unit;
+
+	samplers = nv20->fp_samplers & ~fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1);
+		OUT_RING  (0);
+	}
+
+	samplers = nv20->dirty_samplers & fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		nv20_fragtex_build(nv20, unit);
+	}
+
+	nv20->fp_samplers = fp->samplers;
+#endif
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c
new file mode 100644
index 00000000000..c6106d58c43
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -0,0 +1,156 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv20_context.h"
+#include "nv20_screen.h"
+
+static void
+nv20_miptree_layout(struct nv20_miptree *nv20mt)
+{
+	struct pipe_texture *pt = &nv20mt->base;
+	boolean swizzled = FALSE;
+	uint width = pt->width[0], height = pt->height[0];
+	uint offset = 0;
+	int nr_faces, l, f;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else {
+		nr_faces = 1;
+	}
+	
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+
+		if (swizzled)
+			nv20mt->level[l].pitch = pt->nblocksx[l] * pt->block.size;
+		else
+			nv20mt->level[l].pitch = pt->nblocksx[0] * pt->block.size;
+		nv20mt->level[l].pitch = (nv20mt->level[l].pitch + 63) & ~63;
+
+		nv20mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			nv20mt->level[l].image_offset[f] = offset;
+			offset += nv20mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	nv20mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv20_miptree *mt;
+
+	mt = MALLOC(sizeof(struct nv20_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = screen;
+
+	nv20_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL,
+					   mt->total_size);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+	
+	return &mt->base;
+}
+
+static void
+nv20_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt)
+{
+	struct pipe_texture *mt = *pt;
+
+	*pt = NULL;
+	if (--mt->refcount <= 0) {
+		struct nv20_miptree *nv20mt = (struct nv20_miptree *)mt;
+		int l;
+
+		pipe_buffer_reference(screen, &nv20mt->buffer, NULL);
+		for (l = 0; l <= mt->last_level; l++) {
+			if (nv20mt->level[l].image_offset)
+				FREE(nv20mt->level[l].image_offset);
+		}
+		FREE(nv20mt);
+	}
+}
+
+static struct pipe_surface *
+nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv20_miptree *nv20mt = (struct nv20_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = CALLOC_STRUCT(pipe_surface);
+	if (!ps)
+		return NULL;
+	pipe_texture_reference(&ps->texture, pt);
+	pipe_buffer_reference(screen, &ps->buffer, nv20mt->buffer);
+	ps->format = pt->format;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = nv20mt->level[level].pitch;
+	ps->usage = flags;
+	ps->status = PIPE_SURFACE_STATUS_DEFINED;
+	ps->refcount = 1;
+	ps->winsys = screen->winsys;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ps->offset = nv20mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		ps->offset = nv20mt->level[level].image_offset[zslice];
+	} else {
+		ps->offset = nv20mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv20_miptree_surface_release(struct pipe_screen *pscreen,
+			     struct pipe_surface **psurface)
+{
+	struct pipe_surface *ps = *psurface;
+
+	*psurface = NULL;
+	if (--ps->refcount > 0)
+		return;
+
+	pipe_texture_reference(&ps->texture, NULL);
+	pipe_buffer_reference(pscreen, &ps->buffer, NULL);
+	FREE(ps);
+}
+
+void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv20_miptree_create;
+	pscreen->texture_release = nv20_miptree_release;
+	pscreen->get_tex_surface = nv20_miptree_surface_get;
+	pscreen->tex_surface_release = nv20_miptree_surface_release;
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
new file mode 100644
index 00000000000..a51d657d274
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
@@ -0,0 +1,240 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * Build post-transformation, post-clipping vertex buffers and element
+ * lists by hooking into the end of the primitive pipeline and
+ * manipulating the vertex_id field in the vertex headers.
+ *
+ * XXX: work in progress 
+ * 
+ * \author José Fonseca <[email protected]>
+ * \author Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_debug.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv20_context.h"
+#include "nv20_state.h"
+
+#include "draw/draw_vbuf.h"
+
+/**
+ * Primitive renderer for nv20.
+ */
+struct nv20_vbuf_render {
+	struct vbuf_render base;
+
+	struct nv20_context *nv20;   
+
+	/** Vertex buffer */
+	struct pipe_buffer* buffer;
+
+	/** Vertex size in bytes */
+	unsigned vertex_size;
+
+	/** Hardware primitive */
+	unsigned hwprim;
+};
+
+
+void nv20_vtxbuf_bind( struct nv20_context* nv20 )
+{
+	int i;
+	for(i = 0; i < 8; i++) {
+		BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1);
+		OUT_RING(0/*nv20->vtxbuf*/);
+		BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1);
+		OUT_RING(0/*XXX*/);
+	}
+}
+
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct nv20_vbuf_render *
+nv20_vbuf_render( struct vbuf_render *render )
+{
+	assert(render);
+	return (struct nv20_vbuf_render *)render;
+}
+
+
+static const struct vertex_info *
+nv20_vbuf_render_get_vertex_info( struct vbuf_render *render )
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	struct nv20_context *nv20 = nv20_render->nv20;
+
+	nv20_emit_hw_state(nv20);
+
+	return &nv20->vertex_info;
+}
+
+
+static void *
+nv20_vbuf_render_allocate_vertices( struct vbuf_render *render,
+		ushort vertex_size,
+		ushort nr_vertices )
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	struct nv20_context *nv20 = nv20_render->nv20;
+	struct pipe_winsys *winsys = nv20->pipe.winsys;
+	size_t size = (size_t)vertex_size * (size_t)nr_vertices;
+
+	assert(!nv20_render->buffer);
+	nv20_render->buffer = winsys->buffer_create(winsys, 64, PIPE_BUFFER_USAGE_VERTEX, size);
+
+	nv20->dirty |= NV20_NEW_VTXARRAYS;
+
+	return winsys->buffer_map(winsys, 
+			nv20_render->buffer, 
+			PIPE_BUFFER_USAGE_CPU_WRITE);
+}
+
+
+static void 
+nv20_vbuf_render_set_primitive( struct vbuf_render *render, 
+		unsigned prim )
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	nv20_render->hwprim = prim + 1;
+}
+
+
+static void 
+nv20_vbuf_render_draw( struct vbuf_render *render,
+		const ushort *indices,
+		uint nr_indices)
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	struct nv20_context *nv20 = nv20_render->nv20;
+	int push, i;
+
+	nv20_emit_hw_state(nv20);
+
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+	OUT_RELOCl(nv20_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING(nv20_render->hwprim);
+
+	if (nr_indices & 1) {
+		BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1);
+		OUT_RING  (indices[0]);
+		indices++; nr_indices--;
+	}
+
+	while (nr_indices) {
+		// XXX too big/small ? check the size
+		push = MIN2(nr_indices, 1200 * 2);
+
+		BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
+		for (i = 0; i < push; i+=2)
+			OUT_RING((indices[i+1] << 16) | indices[i]);
+
+		nr_indices -= push;
+		indices  += push;
+	}
+
+	BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+	OUT_RING  (0);
+}
+
+
+static void
+nv20_vbuf_render_release_vertices( struct vbuf_render *render,
+		void *vertices, 
+		unsigned vertex_size,
+		unsigned vertices_used )
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	struct nv20_context *nv20 = nv20_render->nv20;
+	struct pipe_winsys *winsys = nv20->pipe.winsys;
+	struct pipe_screen *pscreen = &nv20->screen->pipe;
+
+	assert(nv20_render->buffer);
+	winsys->buffer_unmap(winsys, nv20_render->buffer);
+	pipe_buffer_reference(pscreen, &nv20_render->buffer, NULL);
+}
+
+
+static void
+nv20_vbuf_render_destroy( struct vbuf_render *render )
+{
+	struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render);
+	FREE(nv20_render);
+}
+
+
+/**
+ * Create a new primitive render.
+ */
+static struct vbuf_render *
+nv20_vbuf_render_create( struct nv20_context *nv20 )
+{
+	struct nv20_vbuf_render *nv20_render = CALLOC_STRUCT(nv20_vbuf_render);
+
+	nv20_render->nv20 = nv20;
+
+	nv20_render->base.max_vertex_buffer_bytes = 16*1024;
+	nv20_render->base.max_indices = 1024;
+	nv20_render->base.get_vertex_info = nv20_vbuf_render_get_vertex_info;
+	nv20_render->base.allocate_vertices = nv20_vbuf_render_allocate_vertices;
+	nv20_render->base.set_primitive = nv20_vbuf_render_set_primitive;
+	nv20_render->base.draw = nv20_vbuf_render_draw;
+	nv20_render->base.release_vertices = nv20_vbuf_render_release_vertices;
+	nv20_render->base.destroy = nv20_vbuf_render_destroy;
+
+	return &nv20_render->base;
+}
+
+
+/**
+ * Create a new primitive vbuf/render stage.
+ */
+struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 )
+{
+	struct vbuf_render *render;
+	struct draw_stage *stage;
+
+	render = nv20_vbuf_render_create(nv20);
+	if(!render)
+		return NULL;
+
+	stage = draw_vbuf_stage( nv20->draw, render );
+	if(!stage) {
+		render->destroy(render);
+		return NULL;
+	}
+
+	return stage;
+}
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
new file mode 100644
index 00000000000..b7f5ea85128
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -0,0 +1,208 @@
+#include "pipe/p_screen.h"
+
+#include "nv20_context.h"
+#include "nv20_screen.h"
+
+static const char *
+nv20_screen_get_name(struct pipe_screen *screen)
+{
+	struct nv20_screen *nv20screen = nv20_screen(screen);
+	struct nouveau_device *dev = nv20screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv20_screen_get_vendor(struct pipe_screen *screen)
+{
+	return "nouveau";
+}
+
+static int
+nv20_screen_get_param(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 2;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 0;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 0;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 1;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 12;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 12;
+	case NOUVEAU_CAP_HW_VTXBUF:
+	case NOUVEAU_CAP_HW_IDXBUF:
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv20_screen_get_paramf(struct pipe_screen *screen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 2.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 4.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static boolean
+nv20_screen_is_format_supported(struct pipe_screen *screen,
+				enum pipe_format format,
+				enum pipe_texture_target target,
+				unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static void *
+nv20_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys *ws = screen->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, surface->buffer, flags);
+	if (!map)
+		return NULL;
+
+	return map + surface->offset;
+}
+
+static void
+nv20_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+
+	ws->buffer_unmap(ws, surface->buffer);
+}
+
+static void
+nv20_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv20_screen *screen = nv20_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->kelvin);
+
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv20_screen *screen = CALLOC_STRUCT(nv20_screen);
+	unsigned celsius_class;
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	if (chipset>=0x20)
+		celsius_class=NV11TCL;
+	else if (chipset>=0x17)
+		celsius_class=NV17TCL;
+	else if (chipset>=0x11)
+		celsius_class=NV11TCL;
+	else
+		celsius_class=NV10TCL;
+
+	if (!celsius_class) {
+		NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset);
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, celsius_class, &screen->kelvin);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv20_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv20_screen_destroy;
+
+	screen->pipe.get_name = nv20_screen_get_name;
+	screen->pipe.get_vendor = nv20_screen_get_vendor;
+	screen->pipe.get_param = nv20_screen_get_param;
+	screen->pipe.get_paramf = nv20_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv20_screen_is_format_supported;
+
+	screen->pipe.surface_map = nv20_surface_map;
+	screen->pipe.surface_unmap = nv20_surface_unmap;
+
+	nv20_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_screen.h b/src/gallium/drivers/nv20/nv20_screen.h
new file mode 100644
index 00000000000..8f2f2e341db
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_screen.h
@@ -0,0 +1,22 @@
+#ifndef __NV20_SCREEN_H__
+#define __NV20_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv20_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *kelvin;
+	struct nouveau_notifier *sync;
+};
+
+static INLINE struct nv20_screen *
+nv20_screen(struct pipe_screen *screen)
+{
+	return (struct nv20_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c
new file mode 100644
index 00000000000..c3b87230b71
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_state.c
@@ -0,0 +1,588 @@
+#include "draw/draw_context.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv20_context.h"
+#include "nv20_state.h"
+
+static void *
+nv20_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv20_blend_state *cb;
+
+	cb = MALLOC(sizeof(struct nv20_blend_state));
+
+	cb->b_enable = cso->blend_enable ? 1 : 0;
+	cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_src_factor)));
+	cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) |
+			 (nvgl_blend_func(cso->rgb_dst_factor)));
+
+	cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) |
+		      ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) |
+		      ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) |
+		      ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0));
+
+	cb->d_enable = cso->dither ? 1 : 0;
+
+	return (void *)cb;
+}
+
+static void
+nv20_blend_state_bind(struct pipe_context *pipe, void *blend)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->blend = (struct nv20_blend_state*)blend;
+
+	nv20->dirty |= NV20_NEW_BLEND;
+}
+
+static void
+nv20_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT;
+		break;
+	}
+
+	return ret >> NV10TCL_TX_FORMAT_WRAP_S_SHIFT;
+}
+
+static void *
+nv20_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv20_sampler_state *ps;
+	uint32_t filter = 0;
+
+	ps = MALLOC(sizeof(struct nv20_sampler_state));
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV10TCL_TX_FORMAT_WRAP_S_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV10TCL_TX_FORMAT_WRAP_T_SHIFT));
+
+	ps->en = 0;
+	if (cso->max_anisotropy > 1.0) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+/*		if (cso->max_anisotropy >= 16.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4.0) {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV10TCL_TX_ENABLE_ANISO_2X;
+		}*/
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV10TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV10TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+/*	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}*/
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+static void
+nv20_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv20->tex_sampler[unit] = sampler[unit];
+		nv20->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void
+nv20_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void
+nv20_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv20->tex_miptree[unit] = (struct nv20_miptree *)miptree[unit];
+		nv20->dirty_samplers |= (1 << unit);
+	}
+}
+
+static void *
+nv20_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv20_rasterizer_state *rs;
+	int i;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	offset_cw/ccw -nohw
+	 * 	scissor
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 * 	offset_units / offset_scale
+	 */
+	rs = MALLOC(sizeof(struct nv20_rasterizer_state));
+
+	rs->templ = cso;
+	
+	rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01;
+
+	rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff;
+	rs->line_smooth_en = cso->line_smooth ? 1 : 0;
+
+	rs->point_size = *(uint32_t*)&cso->point_size;
+
+	rs->poly_smooth_en = cso->poly_smooth ? 1 : 0;
+
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		rs->front_face = NV10TCL_FRONT_FACE_CCW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_cw);
+	} else {
+		rs->front_face = NV10TCL_FRONT_FACE_CW;
+		rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw);
+		rs->poly_mode_back  = nvgl_polygon_mode(cso->fill_ccw);
+	}
+
+	switch (cso->cull_mode) {
+	case PIPE_WINDING_CCW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CCW)
+			rs->cull_face    = NV10TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV10TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_CW:
+		rs->cull_face_en = 1;
+		if (cso->front_winding == PIPE_WINDING_CW)
+			rs->cull_face    = NV10TCL_CULL_FACE_FRONT;
+		else
+			rs->cull_face    = NV10TCL_CULL_FACE_BACK;
+		break;
+	case PIPE_WINDING_BOTH:
+		rs->cull_face_en = 1;
+		rs->cull_face    = NV10TCL_CULL_FACE_FRONT_AND_BACK;
+		break;
+	case PIPE_WINDING_NONE:
+	default:
+		rs->cull_face_en = 0;
+		rs->cull_face    = 0;
+		break;
+	}
+
+	if (cso->point_sprite) {
+		rs->point_sprite = (1 << 0);
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				rs->point_sprite |= (1 << (8 + i));
+		}
+	} else {
+		rs->point_sprite = 0;
+	}
+
+	return (void *)rs;
+}
+
+static void
+nv20_rasterizer_state_bind(struct pipe_context *pipe, void *rast)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->rast = (struct nv20_rasterizer_state*)rast;
+
+	draw_set_rasterizer_state(nv20->draw, (nv20->rast ? nv20->rast->templ : NULL));
+
+	nv20->dirty |= NV20_NEW_RAST;
+}
+
+static void
+nv20_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void *
+nv20_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv20_depth_stencil_alpha_state *hw;
+
+	hw = MALLOC(sizeof(struct nv20_depth_stencil_alpha_state));
+
+	hw->depth.func		= nvgl_comparison_op(cso->depth.func);
+	hw->depth.write_enable	= cso->depth.writemask ? 1 : 0;
+	hw->depth.test_enable	= cso->depth.enabled ? 1 : 0;
+
+	hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0;
+	hw->stencil.wmask = cso->stencil[0].write_mask;
+	hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func);
+	hw->stencil.ref	= cso->stencil[0].ref_value;
+	hw->stencil.vmask = cso->stencil[0].value_mask;
+	hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op);
+	hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op);
+	hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op);
+
+	hw->alpha.enabled = cso->alpha.enabled ? 1 : 0;
+	hw->alpha.func = nvgl_comparison_op(cso->alpha.func);
+	hw->alpha.ref  = float_to_ubyte(cso->alpha.ref);
+
+	return (void *)hw;
+}
+
+static void
+nv20_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->dsa = (struct nv20_depth_stencil_alpha_state*)dsa;
+
+	nv20->dirty |= NV20_NEW_DSA;
+}
+
+static void
+nv20_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void *
+nv20_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *templ)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	return draw_create_vertex_shader(nv20->draw, templ);
+}
+
+static void
+nv20_vp_state_bind(struct pipe_context *pipe, void *shader)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	draw_bind_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader);
+
+	nv20->dirty |= NV20_NEW_VERTPROG;
+}
+
+static void
+nv20_vp_state_delete(struct pipe_context *pipe, void *shader)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	draw_delete_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader);
+}
+
+static void *
+nv20_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv20_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nv20_fragment_program));
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	
+	tgsi_scan_shader(cso->tokens, &fp->info);
+
+	return (void *)fp;
+}
+
+static void
+nv20_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nv20_fragment_program *fp = hwcso;
+
+	nv20->fragprog.current = fp;
+	nv20->dirty |= NV20_NEW_FRAGPROG;
+}
+
+static void
+nv20_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nv20_fragment_program *fp = hwcso;
+
+	nv20_fragprog_destroy(nv20, fp);
+	FREE((void*)fp->pipe.tokens);
+	FREE(fp);
+}
+
+static void
+nv20_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->blend_color = (struct pipe_blend_color*)bcol;
+
+	nv20->dirty |= NV20_NEW_BLENDCOL;
+}
+
+static void
+nv20_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	draw_set_clip_state(nv20->draw, clip);
+}
+
+static void
+nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+
+	assert(shader < PIPE_SHADER_TYPES);
+	assert(index == 0);
+
+	if (buf) {
+		void *mapped;
+		if (buf->size && (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+		{
+			memcpy(nv20->constbuf[shader], mapped, buf->size);
+			nv20->constbuf_nr[shader] =
+				buf->size / (4 * sizeof(float));
+			ws->buffer_unmap(ws, buf->buffer);
+		}
+	}
+}
+
+static void
+nv20_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->framebuffer = (struct pipe_framebuffer_state*)fb;
+
+	nv20->dirty |= NV20_NEW_FRAMEBUFFER;
+}
+
+static void
+nv20_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	NOUVEAU_ERR("line stipple hahaha\n");
+}
+
+static void
+nv20_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->scissor = (struct pipe_scissor_state*)s;
+
+	nv20->dirty |= NV20_NEW_SCISSOR;
+}
+
+static void
+nv20_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	nv20->viewport = (struct pipe_viewport_state*)vpt;
+
+	draw_set_viewport_state(nv20->draw, nv20->viewport);
+
+	nv20->dirty |= NV20_NEW_VIEWPORT;
+}
+
+static void
+nv20_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	memcpy(nv20->vtxbuf, vb, sizeof(*vb) * count);
+	nv20->dirty |= NV20_NEW_VTXARRAYS;
+
+	draw_set_vertex_buffers(nv20->draw, count, vb);
+}
+
+static void
+nv20_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+
+	memcpy(nv20->vtxelt, ve, sizeof(*ve) * count);
+	nv20->dirty |= NV20_NEW_VTXARRAYS;
+
+	draw_set_vertex_elements(nv20->draw, count, ve);
+}
+
+void
+nv20_init_state_functions(struct nv20_context *nv20)
+{
+	nv20->pipe.create_blend_state = nv20_blend_state_create;
+	nv20->pipe.bind_blend_state = nv20_blend_state_bind;
+	nv20->pipe.delete_blend_state = nv20_blend_state_delete;
+
+	nv20->pipe.create_sampler_state = nv20_sampler_state_create;
+	nv20->pipe.bind_sampler_states = nv20_sampler_state_bind;
+	nv20->pipe.delete_sampler_state = nv20_sampler_state_delete;
+	nv20->pipe.set_sampler_textures = nv20_set_sampler_texture;
+
+	nv20->pipe.create_rasterizer_state = nv20_rasterizer_state_create;
+	nv20->pipe.bind_rasterizer_state = nv20_rasterizer_state_bind;
+	nv20->pipe.delete_rasterizer_state = nv20_rasterizer_state_delete;
+
+	nv20->pipe.create_depth_stencil_alpha_state =
+		nv20_depth_stencil_alpha_state_create;
+	nv20->pipe.bind_depth_stencil_alpha_state =
+		nv20_depth_stencil_alpha_state_bind;
+	nv20->pipe.delete_depth_stencil_alpha_state =
+		nv20_depth_stencil_alpha_state_delete;
+
+	nv20->pipe.create_vs_state = nv20_vp_state_create;
+	nv20->pipe.bind_vs_state = nv20_vp_state_bind;
+	nv20->pipe.delete_vs_state = nv20_vp_state_delete;
+
+	nv20->pipe.create_fs_state = nv20_fp_state_create;
+	nv20->pipe.bind_fs_state = nv20_fp_state_bind;
+	nv20->pipe.delete_fs_state = nv20_fp_state_delete;
+
+	nv20->pipe.set_blend_color = nv20_set_blend_color;
+	nv20->pipe.set_clip_state = nv20_set_clip_state;
+	nv20->pipe.set_constant_buffer = nv20_set_constant_buffer;
+	nv20->pipe.set_framebuffer_state = nv20_set_framebuffer_state;
+	nv20->pipe.set_polygon_stipple = nv20_set_polygon_stipple;
+	nv20->pipe.set_scissor_state = nv20_set_scissor_state;
+	nv20->pipe.set_viewport_state = nv20_set_viewport_state;
+
+	nv20->pipe.set_vertex_buffers = nv20_set_vertex_buffers;
+	nv20->pipe.set_vertex_elements = nv20_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_state.h b/src/gallium/drivers/nv20/nv20_state.h
new file mode 100644
index 00000000000..34f402fdcbf
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_state.h
@@ -0,0 +1,139 @@
+#ifndef __NV20_STATE_H__
+#define __NV20_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv20_blend_state {
+	uint32_t b_enable;
+	uint32_t b_srcfunc;
+	uint32_t b_dstfunc;
+
+	uint32_t c_mask;
+
+	uint32_t d_enable;
+};
+
+struct nv20_sampler_state {
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+struct nv20_rasterizer_state {
+	uint32_t shade_model;
+
+	uint32_t line_width;
+	uint32_t line_smooth_en;
+
+	uint32_t point_size;
+
+	uint32_t poly_smooth_en;
+	
+	uint32_t poly_mode_front;
+	uint32_t poly_mode_back;
+
+	uint32_t front_face;
+	uint32_t cull_face;
+	uint32_t cull_face_en;
+
+	uint32_t point_sprite;
+
+	const struct pipe_rasterizer_state *templ;
+};
+
+struct nv20_vertex_program_exec {
+	uint32_t data[4];
+	boolean has_branch_offset;
+	int const_index;
+};
+
+struct nv20_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4];
+};
+
+struct nv20_vertex_program {
+	const struct pipe_shader_state *pipe;
+
+	boolean translated;
+	struct nv20_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv20_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+};
+
+struct nv20_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv20_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	boolean on_hw;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv20_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	uint32_t fp_reg_control;
+};
+
+
+struct nv20_depth_stencil_alpha_state {
+	struct {
+		uint32_t func;
+		uint32_t write_enable;
+		uint32_t test_enable;
+	} depth;
+
+	struct {
+		uint32_t enable;
+		uint32_t wmask;
+		uint32_t func;
+		uint32_t ref;
+		uint32_t vmask;
+		uint32_t fail;
+		uint32_t zfail;
+		uint32_t zpass;
+	} stencil;
+
+	struct {
+		uint32_t enabled;
+		uint32_t func;
+		uint32_t ref;
+	} alpha;
+};
+
+struct nv20_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
new file mode 100644
index 00000000000..e8dd22925c4
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -0,0 +1,303 @@
+#include "nv20_context.h"
+#include "nv20_state.h"
+
+static void nv20_state_emit_blend(struct nv20_context* nv20)
+{
+	struct nv20_blend_state *b = nv20->blend;
+
+	BEGIN_RING(kelvin, NV10TCL_DITHER_ENABLE, 1);
+	OUT_RING  (b->d_enable);
+
+	BEGIN_RING(kelvin, NV10TCL_BLEND_FUNC_ENABLE, 3);
+	OUT_RING  (b->b_enable);
+	OUT_RING  (b->b_srcfunc);
+	OUT_RING  (b->b_dstfunc);
+
+	BEGIN_RING(kelvin, NV10TCL_COLOR_MASK, 1);
+	OUT_RING  (b->c_mask);
+}
+
+static void nv20_state_emit_blend_color(struct nv20_context* nv20)
+{
+	struct pipe_blend_color *c = nv20->blend_color;
+
+	BEGIN_RING(kelvin, NV10TCL_BLEND_COLOR, 1);
+	OUT_RING  ((float_to_ubyte(c->color[3]) << 24)|
+		   (float_to_ubyte(c->color[0]) << 16)|
+		   (float_to_ubyte(c->color[1]) << 8) |
+		   (float_to_ubyte(c->color[2]) << 0));
+}
+
+static void nv20_state_emit_rast(struct nv20_context* nv20)
+{
+	struct nv20_rasterizer_state *r = nv20->rast;
+
+	BEGIN_RING(kelvin, NV10TCL_SHADE_MODEL, 2);
+	OUT_RING  (r->shade_model);
+	OUT_RING  (r->line_width);
+
+
+	BEGIN_RING(kelvin, NV10TCL_POINT_SIZE, 1);
+	OUT_RING  (r->point_size);
+
+	BEGIN_RING(kelvin, NV10TCL_POLYGON_MODE_FRONT, 2);
+	OUT_RING  (r->poly_mode_front);
+	OUT_RING  (r->poly_mode_back);
+
+
+	BEGIN_RING(kelvin, NV10TCL_CULL_FACE, 2);
+	OUT_RING  (r->cull_face);
+	OUT_RING  (r->front_face);
+
+	BEGIN_RING(kelvin, NV10TCL_LINE_SMOOTH_ENABLE, 2);
+	OUT_RING  (r->line_smooth_en);
+	OUT_RING  (r->poly_smooth_en);
+
+	BEGIN_RING(kelvin, NV10TCL_CULL_FACE_ENABLE, 1);
+	OUT_RING  (r->cull_face_en);
+}
+
+static void nv20_state_emit_dsa(struct nv20_context* nv20)
+{
+	struct nv20_depth_stencil_alpha_state *d = nv20->dsa;
+
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_FUNC, 1);
+	OUT_RING (d->depth.func);
+
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+	OUT_RING (d->depth.write_enable);
+
+	BEGIN_RING(kelvin, NV10TCL_DEPTH_TEST_ENABLE, 1);
+	OUT_RING (d->depth.test_enable);
+
+#if 0
+	BEGIN_RING(kelvin, NV10TCL_STENCIL_ENABLE, 1);
+	OUT_RING (d->stencil.enable);
+	BEGIN_RING(kelvin, NV10TCL_STENCIL_MASK, 7);
+	OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+#endif
+
+	BEGIN_RING(kelvin, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+	OUT_RING (d->alpha.enabled);
+
+	BEGIN_RING(kelvin, NV10TCL_ALPHA_FUNC_FUNC, 1);
+	OUT_RING (d->alpha.func);
+
+	BEGIN_RING(kelvin, NV10TCL_ALPHA_FUNC_REF, 1);
+	OUT_RING (d->alpha.ref);
+}
+
+static void nv20_state_emit_viewport(struct nv20_context* nv20)
+{
+}
+
+static void nv20_state_emit_scissor(struct nv20_context* nv20)
+{
+	// XXX this is so not working
+/*	struct pipe_scissor_state *s = nv20->scissor;
+	BEGIN_RING(kelvin, NV10TCL_SCISSOR_HORIZ, 2);
+	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
+	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
+}
+
+static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
+{
+	struct pipe_framebuffer_state* fb = nv20->framebuffer;
+	struct pipe_surface *rt, *zeta = NULL;
+	uint32_t rt_format, w, h;
+	int colour_format = 0, zeta_format = 0;
+
+	w = fb->cbufs[0]->width;
+	h = fb->cbufs[0]->height;
+	colour_format = fb->cbufs[0]->format;
+	rt = fb->cbufs[0];
+
+	if (fb->zsbuf) {
+		if (colour_format) {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		} else {
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+		}
+
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR;
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV10TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	if (zeta) {
+		BEGIN_RING(kelvin, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (rt->stride | (zeta->stride << 16));
+	} else {
+		BEGIN_RING(kelvin, NV10TCL_RT_PITCH, 1);
+		OUT_RING  (rt->stride | (rt->stride << 16));
+	}
+
+	nv20->rt[0] = rt->buffer;
+
+	if (zeta_format)
+	{
+		nv20->zeta = zeta->buffer;
+	}
+
+	BEGIN_RING(kelvin, NV10TCL_RT_HORIZ, 3);
+	OUT_RING  ((w << 16) | 0);
+	OUT_RING  ((h << 16) | 0);
+	OUT_RING  (rt_format);
+	BEGIN_RING(kelvin, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	OUT_RING  (((w - 1) << 16) | 0 | 0x08000800);
+	OUT_RING  (((h - 1) << 16) | 0 | 0x08000800);
+}
+
+static void nv20_vertex_layout(struct nv20_context *nv20)
+{
+	struct nv20_fragment_program *fp = nv20->fragprog.current;
+	uint32_t src = 0;
+	int i;
+	struct vertex_info *vinfo = &nv20->vertex_info;
+
+	memset(vinfo, 0, sizeof(*vinfo));
+
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		switch (fp->info.input_semantic_name[i]) {
+			case TGSI_SEMANTIC_POSITION:
+				draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			case TGSI_SEMANTIC_COLOR:
+				draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src++);
+				break;
+			default:
+			case TGSI_SEMANTIC_GENERIC:
+				draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+			case TGSI_SEMANTIC_FOG:
+				draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++);
+				break;
+		}
+	}
+	draw_compute_vertex_size(vinfo);
+}
+
+void
+nv20_emit_hw_state(struct nv20_context *nv20)
+{
+	int i;
+
+	if (nv20->dirty & NV20_NEW_VERTPROG) {
+		//nv20_vertprog_bind(nv20, nv20->vertprog.current);
+		nv20->dirty &= ~NV20_NEW_VERTPROG;
+	}
+
+	if (nv20->dirty & NV20_NEW_FRAGPROG) {
+		nv20_fragprog_bind(nv20, nv20->fragprog.current);
+		/*XXX: clear NV20_NEW_FRAGPROG if no new program uploaded */
+		nv20->dirty_samplers |= (1<<10);
+		nv20->dirty_samplers = 0;
+	}
+
+	if (nv20->dirty_samplers || (nv20->dirty & NV20_NEW_FRAGPROG)) {
+		nv20_fragtex_bind(nv20);
+		nv20->dirty &= ~NV20_NEW_FRAGPROG;
+	}
+
+	if (nv20->dirty & NV20_NEW_VTXARRAYS) {
+		nv20->dirty &= ~NV20_NEW_VTXARRAYS;
+		nv20_vertex_layout(nv20);
+		nv20_vtxbuf_bind(nv20);
+	}
+
+	if (nv20->dirty & NV20_NEW_BLEND) {
+		nv20->dirty &= ~NV20_NEW_BLEND;
+		nv20_state_emit_blend(nv20);
+	}
+
+	if (nv20->dirty & NV20_NEW_BLENDCOL) {
+		nv20->dirty &= ~NV20_NEW_BLENDCOL;
+		nv20_state_emit_blend_color(nv20);
+	}
+
+	if (nv20->dirty & NV20_NEW_RAST) {
+		nv20->dirty &= ~NV20_NEW_RAST;
+		nv20_state_emit_rast(nv20);
+	}
+
+	if (nv20->dirty & NV20_NEW_DSA) {
+		nv20->dirty &= ~NV20_NEW_DSA;
+		nv20_state_emit_dsa(nv20);
+	}
+
+ 	if (nv20->dirty & NV20_NEW_VIEWPORT) {
+		nv20->dirty &= ~NV20_NEW_VIEWPORT;
+		nv20_state_emit_viewport(nv20);
+	}
+
+ 	if (nv20->dirty & NV20_NEW_SCISSOR) {
+		nv20->dirty &= ~NV20_NEW_SCISSOR;
+		nv20_state_emit_scissor(nv20);
+	}
+
+ 	if (nv20->dirty & NV20_NEW_FRAMEBUFFER) {
+		nv20->dirty &= ~NV20_NEW_FRAMEBUFFER;
+		nv20_state_emit_framebuffer(nv20);
+	}
+
+	/* Emit relocs for every referenced buffer.
+	 * This is to ensure the bufmgr has an accurate idea of how
+	 * the buffer is used.  This isn't very efficient, but we don't
+	 * seem to take a significant performance hit.  Will be improved
+	 * at some point.  Vertex arrays are emitted by nv20_vbo.c
+	 */
+
+	/* Render target */
+// XXX figre out who's who for NV10TCL_DMA_* and fill accordingly
+//	BEGIN_RING(kelvin, NV10TCL_DMA_COLOR0, 1);
+//	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(kelvin, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	if (nv20->zeta) {
+// XXX
+//		BEGIN_RING(kelvin, NV10TCL_DMA_ZETA, 1);
+//		OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		BEGIN_RING(kelvin, NV10TCL_ZETA_OFFSET, 1);
+		OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+		/* XXX for when we allocate LMA on nv17 */
+/*		BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+		OUT_RELOCl(nv20->zeta + lma_offset);*/
+	}
+
+	/* Vertex buffer */
+	BEGIN_RING(kelvin, NV10TCL_DMA_VTXBUF0, 1);
+	OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(kelvin, NV10TCL_COLOR_OFFSET, 1);
+	OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	/* Texture images */
+	for (i = 0; i < 2; i++) {
+		if (!(nv20->fp_samplers & (1 << i)))
+			continue;
+		BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(i), 1);
+		OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+			   NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		BEGIN_RING(kelvin, NV10TCL_TX_FORMAT(i), 1);
+		OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
+			   NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0,
+			   NV10TCL_TX_FORMAT_DMA1);
+	}
+}
+
diff --git a/src/gallium/drivers/nv20/nv20_surface.c b/src/gallium/drivers/nv20/nv20_surface.c
new file mode 100644
index 00000000000..41b6d6ad35c
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_surface.c
@@ -0,0 +1,64 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv20_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+#include "util/u_tile.h"
+
+static void
+nv20_surface_copy(struct pipe_context *pipe, unsigned do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nouveau_winsys *nvws = nv20->nvws;
+
+	nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+			   width, height);
+}
+
+static void
+nv20_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv20_context *nv20 = nv20_context(pipe);
+	struct nouveau_winsys *nvws = nv20->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+void
+nv20_init_surface_functions(struct nv20_context *nv20)
+{
+	nv20->pipe.surface_copy = nv20_surface_copy;
+	nv20->pipe.surface_fill = nv20_surface_fill;
+}
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
new file mode 100644
index 00000000000..4edc4efebd8
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -0,0 +1,77 @@
+#include "draw/draw_context.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv20_context.h"
+#include "nv20_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+boolean nv20_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *indexBuffer,
+                    unsigned indexSize,
+                    unsigned prim, unsigned start, unsigned count)
+{
+	struct nv20_context *nv20 = nv20_context( pipe );
+	struct draw_context *draw = nv20->draw;
+	unsigned i;
+
+	nv20_emit_hw_state(nv20);
+
+	/*
+	 * Map vertex buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv20->vtxbuf[i].buffer) {
+			void *buf
+				= pipe->winsys->buffer_map(pipe->winsys,
+						nv20->vtxbuf[i].buffer,
+						PIPE_BUFFER_USAGE_CPU_READ);
+			draw_set_mapped_vertex_buffer(draw, i, buf);
+		}
+	}
+	/* Map index buffer, if present */
+	if (indexBuffer) {
+		void *mapped_indexes
+			= pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+					PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
+	}
+	else {
+		/* no index/element buffer */
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	draw_set_mapped_constant_buffer(draw,
+					nv20->constbuf[PIPE_SHADER_VERTEX],
+					nv20->constbuf_nr[PIPE_SHADER_VERTEX]);
+
+	/* draw! */
+	draw_arrays(nv20->draw, prim, start, count);
+
+	/*
+	 * unmap vertex/index buffers
+	 */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		if (nv20->vtxbuf[i].buffer) {
+			pipe->winsys->buffer_unmap(pipe->winsys, nv20->vtxbuf[i].buffer);
+			draw_set_mapped_vertex_buffer(draw, i, NULL);
+		}
+	}
+	if (indexBuffer) {
+		pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+		draw_set_mapped_element_buffer(draw, 0, NULL);
+	}
+
+	return TRUE;
+}
+
+boolean nv20_draw_arrays( struct pipe_context *pipe,
+				 unsigned prim, unsigned start, unsigned count)
+{
+	return nv20_draw_elements(pipe, NULL, 0, prim, start, count);
+}
+
+
+
diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c
new file mode 100644
index 00000000000..a885fcd7a56
--- /dev/null
+++ b/src/gallium/drivers/nv20/nv20_vertprog.c
@@ -0,0 +1,838 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "nv20_context.h"
+#include "nv20_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  2. Arb. swz/negation
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0
+#define DEF_CTEST 0
+#include "nv20_shader.h"
+
+#define swz(s,x,y,z,w) nv20_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv20_sr_neg((s))
+#define abs(s) nv20_sr_abs((s))
+
+struct nv20_vpc {
+	struct nv20_vertex_program *vp;
+
+	struct nv20_vertex_program_exec *vpi;
+
+	unsigned output_map[PIPE_MAX_SHADER_OUTPUTS];
+
+	int high_temp;
+	int temp_temp_count;
+
+	struct nv20_sreg *imm;
+	unsigned nr_imm;
+};
+
+static struct nv20_sreg
+temp(struct nv20_vpc *vpc)
+{
+	int idx;
+
+	idx  = vpc->temp_temp_count++;
+	idx += vpc->high_temp + 1;
+	return nv20_sr(NV30SR_TEMP, idx);
+}
+
+static struct nv20_sreg
+constant(struct nv20_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nv20_vertex_program *vp = vpc->vp;
+	struct nv20_vertex_program_data *vpd;
+	int idx;
+
+	if (pipe >= 0) {
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nv20_sr(NV30SR_CONST, idx);
+		}
+	}
+
+	idx = vp->nr_consts++;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nv20_sr(NV30SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv20_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2))
+
+static void
+emit_src(struct nv20_vpc *vpc, uint32_t *hw, int pos, struct nv20_sreg src)
+{
+	struct nv20_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV30SR_TEMP:
+		sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT);
+		sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT);
+		break;
+	case NV30SR_INPUT:
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		vp->ir |= (1 << src.index);
+		hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT);
+		break;
+	case NV30SR_CONST:
+		sr |= (NV30_VP_SRC_REG_TYPE_CONST <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);
+		vpc->vpi->const_index = src.index;
+		break;
+	case NV30SR_NONE:
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV30_VP_SRC_NEGATE;
+
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT));
+
+/*
+ * |VVV|
+ * d�.�b
+ *  \u/
+ *
+ */
+
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >>
+			  NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) <<
+			  NV30_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >>
+			  NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) <<
+			  NV30_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+emit_dst(struct nv20_vpc *vpc, uint32_t *hw, int slot, struct nv20_sreg dst)
+{
+	struct nv20_vertex_program *vp = vpc->vp;
+
+	switch (dst.type) {
+	case NV30SR_TEMP:
+		hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		break;
+	case NV30SR_OUTPUT:
+		switch (dst.index) {
+		case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+		case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+		case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+		case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+		case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
+		case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+		case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+		case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+		case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+		case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+		case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+		case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+		case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+		case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+		default:
+			break;
+		}
+
+		hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+		hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+
+		/*XXX: no way this is entirely correct, someone needs to
+		 *     figure out what exactly it is.
+		 */
+		hw[3] |= 0x800;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+nv20_vp_arith(struct nv20_vpc *vpc, int slot, int op,
+	      struct nv20_sreg dst, int mask,
+	      struct nv20_sreg s0, struct nv20_sreg s1,
+	      struct nv20_sreg s2)
+{
+	struct nv20_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1;
+
+	hw = vpc->vpi->data;
+
+	hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV30_VP_INST_COND_SWZ_W_SHIFT));
+
+	hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+//	hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK;
+//	hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT);
+
+	if (dst.type == NV30SR_OUTPUT) {
+		if (slot)
+			hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+		else
+			hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+	} else {
+		if (slot)
+			hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+		else
+			hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+	}
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+}
+
+static INLINE struct nv20_sreg
+tgsi_src(struct nv20_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv20_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv20_sr(NV30SR_INPUT, fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		if (vpc->high_temp < fsrc->SrcRegister.Index)
+			vpc->high_temp = fsrc->SrcRegister.Index;
+		src = nv20_sr(NV30SR_TEMP, fsrc->SrcRegister.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv20_sreg
+tgsi_dst(struct nv20_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nv20_sreg dst;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		dst = nv20_sr(NV30SR_OUTPUT,
+			      vpc->output_map[fdst->DstRegister.Index]);
+
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = nv20_sr(NV30SR_TEMP, fdst->DstRegister.Index);
+		if (vpc->high_temp < dst.index)
+			vpc->high_temp = dst.index;
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		break;
+	}
+
+	return dst;
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask = 0;
+
+	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
+	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
+	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
+	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
+	return mask;
+}
+
+static boolean
+nv20_vertprog_parse_instruction(struct nv20_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv20_sreg src[3], dst, tmp;
+	struct nv20_sreg none = nv20_sr(NV30SR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	vpc->temp_temp_count = 0;
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		/*XXX: index comparison is broken now that consts come from
+		 *     two different register files.
+		 */
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_IMMEDIATE:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV30_VP_INST_DEST_POS;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_VP_INST_DEST_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_VP_INST_DEST_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_VP_INST_DEST_BFC0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_VP_INST_DEST_BFC1;
+		} else {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV30_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NV30_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->output_map[fdec->DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv20_vertprog_prepare(struct nv20_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int nr_imm = 0;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv20_sreg));
+		assert(vpc->imm);
+	}
+
+	return TRUE;
+}
+
+static void
+nv20_vertprog_translate(struct nv20_context *nv20,
+			struct nv20_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv20_vpc *vpc = NULL;
+
+	tgsi_dump(vp->pipe.tokens,0);
+
+	vpc = CALLOC(1, sizeof(struct nv20_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+	vpc->high_temp = -1;
+
+	if (!nv20_vertprog_prepare(vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_OUTPUT:
+				if (!nv20_vertprog_parse_decl_output(vpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+//			assert(imm->Immediate.Size == 4);
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u.ImmediateFloat32[0].Float,
+					 imm->u.ImmediateFloat32[1].Float,
+					 imm->u.ImmediateFloat32[2].Float,
+					 imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv20_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	FREE(vpc);
+}
+
+static boolean
+nv20_vertprog_validate(struct nv20_context *nv20)
+{ 
+	struct nouveau_winsys *nvws = nv20->nvws;
+	struct pipe_winsys *ws = nv20->pipe.winsys;
+	struct nouveau_grobj *rankine = nv20->screen->rankine;
+	struct nv20_vertex_program *vp;
+	struct pipe_buffer *constbuf;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	vp = nv20->vertprog;
+	constbuf = nv20->constbuf[PIPE_SHADER_VERTEX];
+
+	/* Translate TGSI shader into hw bytecode */
+	if (!vp->translated) {
+		nv20_vertprog_translate(nv20, vp);
+		if (!vp->translated)
+			return FALSE;
+	}
+
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nv20->screen->vp_exec_heap;
+		struct nouveau_stateobj *so;
+		uint vplen = vp->nr_insns;
+
+		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+			while (heap->next && heap->size < vplen) {
+				struct nv20_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->exec);
+			}
+
+			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		so = so_new(2, 0);
+		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
+		so_data  (so, vp->exec->start);
+		so_ref(so, &vp->so);
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nv20->screen->vp_data_heap;
+
+		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nv20_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv20_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv20_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK;
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NV30_VP_INST_CONST_SRC_SHIFT;
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = ws->buffer_map(ws, constbuf,
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nv20_vertex_program_data *vpd = &vp->consts[i];
+
+			if (vpd->index >= 0) {
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (i + vp->data->start);
+			OUT_RINGp ((uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf) {
+			ws->buffer_unmap(ws, constbuf);
+		}
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+				i, vp->insns[i].data[0], vp->insns[i].data[1],
+				vp->insns[i].data[2], vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (vp->insns[i].data, 4);
+		}
+	}
+
+	if (vp->so != nv20->state.hw[NV30_STATE_VERTPROG]) {
+		so_ref(vp->so, &nv20->state.hw[NV30_STATE_VERTPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv20_vertprog_destroy(struct nv20_context *nv20, struct nv20_vertex_program *vp)
+{
+	struct nouveau_winsys *nvws = nv20->screen->nvws;
+
+	vp->translated = FALSE;
+
+	if (vp->nr_insns) {
+		FREE(vp->insns);
+		vp->insns = NULL;
+		vp->nr_insns = 0;
+	}
+
+	if (vp->nr_consts) {
+		FREE(vp->consts);
+		vp->consts = NULL;
+		vp->nr_consts = 0;
+	}
+
+	nvws->res_free(&vp->exec);
+	vp->exec_start = 0;
+	nvws->res_free(&vp->data);
+	vp->data_start = 0;
+	vp->data_start_min = 0;
+
+	vp->ir = vp->or = 0;
+	so_ref(NULL, &vp->so);
+}
+
+struct nv20_state_entry nv20_state_vertprog = {
+	.validate = nv20_vertprog_validate,
+	.dirty = {
+		.pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/,
+		.hw = NV30_STATE_VERTPROG,
+	}
+};
diff --git a/src/gallium/drivers/nv30/Makefile b/src/gallium/drivers/nv30/Makefile
new file mode 100644
index 00000000000..69f2790dfe2
--- /dev/null
+++ b/src/gallium/drivers/nv30/Makefile
@@ -0,0 +1,37 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv30
+
+DRIVER_SOURCES = \
+	nv30_clear.c \
+	nv30_context.c \
+	nv30_draw.c \
+	nv30_fragprog.c \
+	nv30_fragtex.c \
+	nv30_miptree.c \
+	nv30_query.c \
+	nv30_screen.c \
+	nv30_state.c \
+	nv30_state_blend.c \
+	nv30_state_emit.c \
+	nv30_state_fb.c \
+	nv30_state_rasterizer.c \
+	nv30_state_scissor.c \
+	nv30_state_stipple.c \
+	nv30_state_viewport.c \
+	nv30_state_zsa.c \
+	nv30_surface.c \
+	nv30_vbo.c \
+	nv30_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv30/nv30_clear.c b/src/gallium/drivers/nv30/nv30_clear.c
new file mode 100644
index 00000000000..8c3ca204d58
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_clear.c
@@ -0,0 +1,13 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv30_context.h"
+
+void
+nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+	ps->status = PIPE_SURFACE_STATUS_CLEAR;
+}
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
new file mode 100644
index 00000000000..2bff28aca9c
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -0,0 +1,72 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv30_context.h"
+#include "nv30_screen.h"
+
+static void
+nv30_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(rankine, 0x1fd8, 1);
+		OUT_RING  (2);
+		BEGIN_RING(rankine, 0x1fd8, 1);
+		OUT_RING  (1);
+	}
+
+	FIRE_RING(fence);
+}
+
+static void
+nv30_destroy(struct pipe_context *pipe)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	if (nv30->draw)
+		draw_destroy(nv30->draw);
+	FREE(nv30);
+}
+
+struct pipe_context *
+nv30_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct nv30_screen *screen = nv30_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv30_context *nv30;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nv30 = CALLOC(1, sizeof(struct nv30_context));
+	if (!nv30)
+		return NULL;
+	nv30->screen = screen;
+	nv30->pctx_id = pctx_id;
+
+	nv30->nvws = nvws;
+
+	nv30->pipe.winsys = ws;
+	nv30->pipe.screen = pscreen;
+	nv30->pipe.destroy = nv30_destroy;
+	nv30->pipe.draw_arrays = nv30_draw_arrays;
+	nv30->pipe.draw_elements = nv30_draw_elements;
+	nv30->pipe.clear = nv30_clear;
+	nv30->pipe.flush = nv30_flush;
+
+	nv30_init_query_functions(nv30);
+	nv30_init_surface_functions(nv30);
+	nv30_init_state_functions(nv30);
+
+	/* Create, configure, and install fallback swtnl path */
+	nv30->draw = draw_create();
+	draw_wide_point_threshold(nv30->draw, 9999999.0);
+	draw_wide_line_threshold(nv30->draw, 9999999.0);
+	draw_enable_line_stipple(nv30->draw, FALSE);
+	draw_enable_point_sprites(nv30->draw, FALSE);
+	draw_set_rasterize_stage(nv30->draw, nv30_draw_render_stage(nv30));
+
+	return &nv30->pipe;
+}
+	
diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h
new file mode 100644
index 00000000000..b9337697008
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_context.h
@@ -0,0 +1,212 @@
+#ifndef __NV30_CONTEXT_H__
+#define __NV30_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv30_screen *ctx = nv30->screen
+#include "nouveau/nouveau_push.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#include "nv30_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+enum nv30_state_index {
+	NV30_STATE_FB = 0,
+	NV30_STATE_VIEWPORT = 1,
+	NV30_STATE_BLEND = 2,
+	NV30_STATE_RAST = 3,
+	NV30_STATE_ZSA = 4,
+	NV30_STATE_BCOL = 5,
+	NV30_STATE_CLIP = 6,
+	NV30_STATE_SCISSOR = 7,
+	NV30_STATE_STIPPLE = 8,
+	NV30_STATE_FRAGPROG = 9,
+	NV30_STATE_VERTPROG = 10,
+	NV30_STATE_FRAGTEX0 = 11,
+	NV30_STATE_FRAGTEX1 = 12,
+	NV30_STATE_FRAGTEX2 = 13,
+	NV30_STATE_FRAGTEX3 = 14,
+	NV30_STATE_FRAGTEX4 = 15,
+	NV30_STATE_FRAGTEX5 = 16,
+	NV30_STATE_FRAGTEX6 = 17,
+	NV30_STATE_FRAGTEX7 = 18,
+	NV30_STATE_FRAGTEX8 = 19,
+	NV30_STATE_FRAGTEX9 = 20,
+	NV30_STATE_FRAGTEX10 = 21,
+	NV30_STATE_FRAGTEX11 = 22,
+	NV30_STATE_FRAGTEX12 = 23,
+	NV30_STATE_FRAGTEX13 = 24,
+	NV30_STATE_FRAGTEX14 = 25,
+	NV30_STATE_FRAGTEX15 = 26,
+	NV30_STATE_VERTTEX0 = 27,
+	NV30_STATE_VERTTEX1 = 28,
+	NV30_STATE_VERTTEX2 = 29,
+	NV30_STATE_VERTTEX3 = 30,
+	NV30_STATE_VTXBUF = 31,
+	NV30_STATE_VTXFMT = 32,
+	NV30_STATE_VTXATTR = 33,
+	NV30_STATE_MAX = 34
+};
+
+#include "nv30_screen.h"
+
+#define NV30_NEW_BLEND		(1 <<  0)
+#define NV30_NEW_RAST		(1 <<  1)
+#define NV30_NEW_ZSA		(1 <<  2)
+#define NV30_NEW_SAMPLER	(1 <<  3)
+#define NV30_NEW_FB		(1 <<  4)
+#define NV30_NEW_STIPPLE	(1 <<  5)
+#define NV30_NEW_SCISSOR	(1 <<  6)
+#define NV30_NEW_VIEWPORT	(1 <<  7)
+#define NV30_NEW_BCOL		(1 <<  8)
+#define NV30_NEW_VERTPROG	(1 <<  9)
+#define NV30_NEW_FRAGPROG	(1 << 10)
+#define NV30_NEW_ARRAYS		(1 << 11)
+#define NV30_NEW_UCP		(1 << 12)
+
+struct nv30_rasterizer_state {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv30_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv30_blend_state {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+
+struct nv30_state {
+	unsigned scissor_enabled;
+	unsigned stipple_enabled;
+	unsigned viewport_bypass;
+	unsigned fp_samplers;
+
+	uint64_t dirty;
+	struct nouveau_stateobj *hw[NV30_STATE_MAX];
+};
+
+struct nv30_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv30_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	/* HW state derived from pipe states */
+	struct nv30_state state;
+
+	/* Context state */
+	unsigned dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct nv30_vertex_program *vertprog;
+	struct nv30_fragment_program *fragprog;
+	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nv30_rasterizer_state *rasterizer;
+	struct nv30_zsa_state *zsa;
+	struct nv30_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct pipe_buffer *idxbuf;
+	unsigned idxbuf_format;
+	struct nv30_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv30_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+	unsigned vtxelt_nr;
+	const unsigned *edgeflags;
+};
+
+static INLINE struct nv30_context *
+nv30_context(struct pipe_context *pipe)
+{
+	return (struct nv30_context *)pipe;
+}
+
+struct nv30_state_entry {
+	boolean (*validate)(struct nv30_context *nv30);
+	struct {
+		unsigned pipe;
+		unsigned hw;
+	} dirty;
+};
+
+extern void nv30_init_state_functions(struct nv30_context *nv30);
+extern void nv30_init_surface_functions(struct nv30_context *nv30);
+extern void nv30_init_query_functions(struct nv30_context *nv30);
+
+extern void nv30_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv30_draw.c */
+extern struct draw_stage *nv30_draw_render_stage(struct nv30_context *nv30);
+
+/* nv30_vertprog.c */
+extern void nv30_vertprog_destroy(struct nv30_context *,
+				  struct nv30_vertex_program *);
+
+/* nv30_fragprog.c */
+extern void nv30_fragprog_destroy(struct nv30_context *,
+				  struct nv30_fragment_program *);
+
+/* nv30_fragtex.c */
+extern void nv30_fragtex_bind(struct nv30_context *);
+
+/* nv30_state.c and friends */
+extern boolean nv30_state_validate(struct nv30_context *nv30);
+extern void nv30_state_emit(struct nv30_context *nv30);
+extern struct nv30_state_entry nv30_state_rasterizer;
+extern struct nv30_state_entry nv30_state_scissor;
+extern struct nv30_state_entry nv30_state_stipple;
+extern struct nv30_state_entry nv30_state_fragprog;
+extern struct nv30_state_entry nv30_state_vertprog;
+extern struct nv30_state_entry nv30_state_blend;
+extern struct nv30_state_entry nv30_state_blend_colour;
+extern struct nv30_state_entry nv30_state_zsa;
+extern struct nv30_state_entry nv30_state_viewport;
+extern struct nv30_state_entry nv30_state_framebuffer;
+extern struct nv30_state_entry nv30_state_fragtex;
+extern struct nv30_state_entry nv30_state_vbo;
+
+/* nv30_vbo.c */
+extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv30_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nv30_clear.c */
+extern void nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+#endif
diff --git a/src/gallium/drivers/nv30/nv30_draw.c b/src/gallium/drivers/nv30/nv30_draw.c
new file mode 100644
index 00000000000..74fc138c051
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_draw.c
@@ -0,0 +1,61 @@
+#include "draw/draw_pipe.h"
+
+#include "nv30_context.h"
+
+struct nv30_draw_stage {
+	struct draw_stage draw;
+	struct nv30_context *nv30;
+};
+
+static void
+nv30_draw_point(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_line(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_tri(struct draw_stage *draw, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_flush(struct draw_stage *draw, unsigned flags)
+{
+}
+
+static void
+nv30_draw_reset_stipple_counter(struct draw_stage *draw)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv30_draw_destroy(struct draw_stage *draw)
+{
+	FREE(draw);
+}
+
+struct draw_stage *
+nv30_draw_render_stage(struct nv30_context *nv30)
+{
+	struct nv30_draw_stage *nv30draw = CALLOC_STRUCT(nv30_draw_stage);
+
+	nv30draw->nv30 = nv30;
+	nv30draw->draw.draw = nv30->draw;
+	nv30draw->draw.point = nv30_draw_point;
+	nv30draw->draw.line = nv30_draw_line;
+	nv30draw->draw.tri = nv30_draw_tri;
+	nv30draw->draw.flush = nv30_draw_flush;
+	nv30draw->draw.reset_stipple_counter = nv30_draw_reset_stipple_counter;
+	nv30draw->draw.destroy = nv30_draw_destroy;
+
+	return &nv30draw->draw;
+}
+
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
new file mode 100644
index 00000000000..320ba3f4bf4
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -0,0 +1,911 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv30_context.h"
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 1
+#define MASK_Y 2
+#define MASK_Z 4
+#define MASK_W 8
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE NV30_FP_OP_DST_SCALE_1X
+#define DEF_CTEST NV30_FP_OP_COND_TR
+#include "nv30_shader.h"
+
+#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv30_sr_neg((s))
+#define abs(s) nv30_sr_abs((s))
+#define scale(s,v) nv30_sr_scale((s), NV30_FP_OP_DST_SCALE_##v)
+
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+struct nv30_fpc {
+	struct nv30_fragment_program *fp;
+
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	int high_temp;
+	int temp_temp_count;
+	int num_regs;
+
+	uint depth_id;
+	uint colour_id;
+
+	unsigned inst_offset;
+
+	struct {
+		int pipe;
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	struct nv30_sreg imm[MAX_IMM];
+	unsigned nr_imm;
+};
+
+static INLINE struct nv30_sreg
+temp(struct nv30_fpc *fpc)
+{
+	int idx;
+
+	idx  = fpc->temp_temp_count++;
+	idx += fpc->high_temp + 1;
+	return nv30_sr(NV30SR_TEMP, idx);
+}
+
+static INLINE struct nv30_sreg
+constant(struct nv30_fpc *fpc, int pipe, float vals[4])
+{
+	int idx;
+
+	if (fpc->nr_consts == MAX_CONSTS)
+		assert(0);
+	idx = fpc->nr_consts++;
+
+	fpc->consts[idx].pipe = pipe;
+	if (pipe == -1)
+		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
+	return nv30_sr(NV30SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv30_fp_arith((cc), (s), NV30_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nv30_fp_tex((cc), (s), NV30_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+static void
+grow_insns(struct nv30_fpc *fpc, int size)
+{
+	struct nv30_fragment_program *fp = fpc->fp;
+
+	fp->insn_len += size;
+	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
+}
+
+static void
+emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src)
+{
+	struct nv30_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV30SR_INPUT:
+		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NV30_FP_OP_INPUT_SRC_SHIFT);
+		break;
+	case NV30SR_OUTPUT:
+		sr |= NV30_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NV30SR_TEMP:
+		sr |= (NV30_FP_REG_TYPE_TEMP << NV30_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NV30_FP_REG_SRC_SHIFT);
+		break;
+	case NV30SR_CONST:
+		grow_insns(fpc, 4);
+		hw = &fp->insn[fpc->inst_offset];
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nv30_fragment_program_data *fpd;
+
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT);	
+		break;
+	case NV30SR_NONE:
+		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV30_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos));
+
+	sr |= ((src.swz[0] << NV30_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV30_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV30_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV30_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+static void
+emit_dst(struct nv30_fpc *fpc, struct nv30_sreg dst)
+{
+	struct nv30_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+
+	switch (dst.type) {
+	case NV30SR_TEMP:
+		if (fpc->num_regs < (dst.index + 1))
+			fpc->num_regs = dst.index + 1;
+		break;
+	case NV30SR_OUTPUT:
+		if (dst.index == 1) {
+			fp->fp_control |= 0xe;
+		} else {
+			hw[0] |= NV30_FP_OP_OUT_REG_HALF;
+		}
+		break;
+	case NV30SR_NONE:
+		hw[0] |= (1 << 30);
+		break;
+	default:
+		assert(0);
+	}
+
+	hw[0] |= (dst.index << NV30_FP_OP_OUT_REG_SHIFT);
+}
+
+static void
+nv30_fp_arith(struct nv30_fpc *fpc, int sat, int op,
+	      struct nv30_sreg dst, int mask,
+	      struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2)
+{
+	struct nv30_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NV30_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
+	hw[0] |= (op << NV30_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NV30_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NV30_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NV30_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NV30_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NV30_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NV30_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NV30_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NV30_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NV30_FP_OP_COND_SWZ_W_SHIFT));
+
+	emit_dst(fpc, dst);
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+static void
+nv30_fp_tex(struct nv30_fpc *fpc, int sat, int op, int unit,
+	    struct nv30_sreg dst, int mask,
+	    struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2)
+{
+	struct nv30_fragment_program *fp = fpc->fp;
+
+	nv30_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	fp->insn[fpc->inst_offset] |= (unit << NV30_FP_OP_TEX_UNIT_SHIFT);
+	fp->samplers |= (1 << unit);
+}
+
+static INLINE struct nv30_sreg
+tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	struct nv30_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv30_sr(NV30SR_INPUT,
+			      fpc->attrib_map[fsrc->SrcRegister.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->SrcRegister.Index < fpc->nr_imm);
+		src = fpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1);
+		if (fpc->high_temp < src.index)
+			fpc->high_temp = src.index;
+		break;
+	/* This is clearly insane, but gallium hands us shaders like this.
+	 * Luckily fragprog results are just temp regs..
+	 */
+	case TGSI_FILE_OUTPUT:
+		if (fsrc->SrcRegister.Index == fpc->colour_id)
+			return nv30_sr(NV30SR_OUTPUT, 0);
+		else
+			return nv30_sr(NV30SR_OUTPUT, 1);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	int idx;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		if (fdst->DstRegister.Index == fpc->colour_id)
+			return nv30_sr(NV30SR_OUTPUT, 0);
+		else
+			return nv30_sr(NV30SR_OUTPUT, 1);
+		break;
+	case TGSI_FILE_TEMPORARY:
+		idx = fdst->DstRegister.Index + 1;
+		if (fpc->high_temp < idx)
+			fpc->high_temp = idx;
+		return nv30_sr(NV30SR_TEMP, idx);
+	case TGSI_FILE_NULL:
+		return nv30_sr(NV30SR_NONE, 0);
+	default:
+		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+		return nv30_sr(NV30SR_NONE, 0);
+	}
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask = 0;
+
+	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
+	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
+	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
+	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
+	return mask;
+}
+
+static boolean
+src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv30_sreg *src)
+{
+	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	struct nv30_sreg tgsi = tgsi_src(fpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+
+	for (c = 0; c < 4; c++) {
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= (1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+
+		if (!tgsi.negate && neg[c])
+			neg_mask |= (1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)
+		return TRUE;
+
+	*src = temp(fpc);
+
+	if (mask)
+		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
+
+	if (zero_mask)
+		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
+
+	if (one_mask)
+		arith(fpc, 0, STR, *src, one_mask, *src, none, none);
+
+	if (neg_mask) {
+		struct nv30_sreg one = temp(fpc);
+		arith(fpc, 0, STR, one, neg_mask, one, none, none);
+		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	struct nv30_sreg src[3], dst, tmp;
+	int mask, sat, unit = 0;
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	fpc->temp_temp_count = 0;
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(fpc, fsrc, &src[i]))
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				NOUVEAU_MSG("extra src attr %d\n",
+					 fsrc->SrcRegister.Index);
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_IMMEDIATE:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->SrcRegister.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		tmp = temp(fpc);
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV30_VP_INST_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		dst = nv30_sr(NV30SR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NV30_FP_OP_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		arith(fpc, sat, LRP, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_NOISE1:
+	case TGSI_OPCODE_NOISE2:
+	case TGSI_OPCODE_NOISE3:
+	case TGSI_OPCODE_NOISE4:
+		arith(fpc, sat, SFL, dst, mask, none, none, none);
+		break;
+	case TGSI_OPCODE_POW:
+		arith(fpc, sat, POW, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		arith(fpc, 0, RFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+		break;
+	case TGSI_OPCODE_SCS:
+		if (mask & MASK_X) {
+			arith(fpc, sat, COS, dst, MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		if (mask & MASK_Y) {
+			arith(fpc, sat, SIN, dst, MASK_Y,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV30_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_FP_OP_INPUT_SRC_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_FP_OP_INPUT_SRC_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV30_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic.
+						     SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	fpc->attrib_map[fdec->DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		fpc->depth_id = fdec->DeclarationRange.First;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		fpc->colour_id = fdec->DeclarationRange.First;
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv30_fragprog_prepare(struct nv30_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	/*int high_temp = -1, i;*/
+
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nv30_fragprog_parse_decl_attrib(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nv30_fragprog_parse_decl_output(fpc, fdec))
+					goto out_err;
+				break;
+			/*case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp) {
+					high_temp =
+						fdec->DeclarationRange.Last;
+				}
+				break;*/
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4];
+			
+			imm = &p.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(fpc->nr_imm < MAX_IMM);
+
+			vals[0] = imm->u.ImmediateFloat32[0].Float;
+			vals[1] = imm->u.ImmediateFloat32[1].Float;
+			vals[2] = imm->u.ImmediateFloat32[2].Float;
+			vals[3] = imm->u.ImmediateFloat32[3].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	/*if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nv30_sreg));
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		fpc->r_temps_discard = 0;
+	}*/
+
+	return TRUE;
+
+out_err:
+	/*if (fpc->r_temp)
+		FREE(fpc->r_temp);*/
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+static void
+nv30_fragprog_translate(struct nv30_context *nv30,
+			struct nv30_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nv30_fpc *fpc = NULL;
+
+	tgsi_dump(fp->pipe.tokens,0);
+
+	fpc = CALLOC(1, sizeof(struct nv30_fpc));
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->high_temp = -1;
+	fpc->num_regs = 2;
+
+	if (!nv30_fragprog_prepare(fpc)) {
+		FREE(fpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv30_fragprog_parse_instruction(fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	fp->fp_control |= (fpc->num_regs-1)/2;
+	fp->fp_reg_control = (1<<16)|0x4;
+
+	/* Terminate final instruction */
+	fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+	
+	fp->translated = TRUE;
+	fp->on_hw = FALSE;
+out_err:
+	tgsi_parse_free(&parse);
+	FREE(fpc);
+}
+
+static void
+nv30_fragprog_upload(struct nv30_context *nv30,
+		     struct nv30_fragment_program *fp)
+{
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	const uint32_t le = 1;
+	uint32_t *map;
+	int i;
+
+	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+#if 0
+	for (i = 0; i < fp->insn_len; i++) {
+		fflush(stdout); fflush(stderr);
+		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
+		fflush(stdout); fflush(stderr);
+	}
+#endif
+
+	if ((*(const uint8_t *)&le)) {
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = fp->insn[i];
+		}
+	} else {
+		/* Weird swapping for big-endian chips */
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = ((fp->insn[i] & 0xffff) << 16) |
+				  ((fp->insn[i] >> 16) & 0xffff);
+		}
+	}
+
+	ws->buffer_unmap(ws, fp->buffer);
+}
+
+static boolean
+nv30_fragprog_validate(struct nv30_context *nv30)
+{
+	struct nv30_fragment_program *fp = nv30->fragprog;
+	struct pipe_buffer *constbuf =
+		nv30->constbuf[PIPE_SHADER_FRAGMENT];
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct nouveau_stateobj *so;
+	boolean new_consts = FALSE;
+	int i;
+
+	if (fp->translated)
+		goto update_constants;
+
+	/*nv30->fallback_swrast &= ~NV30_NEW_FRAGPROG;*/
+	nv30_fragprog_translate(nv30, fp);
+	if (!fp->translated) {
+		/*nv30->fallback_swrast |= NV30_NEW_FRAGPROG;*/
+		return FALSE;
+	}
+
+	fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4);
+	nv30_fragprog_upload(nv30, fp);
+
+	so = so_new(8, 1);
+	so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);
+	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		  NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+	so_method(so, nv30->screen->rankine, NV34TCL_FP_CONTROL, 1);
+	so_data  (so, fp->fp_control);
+	so_method(so, nv30->screen->rankine, NV34TCL_FP_REG_CONTROL, 1);
+	so_data  (so, fp->fp_reg_control);
+	so_method(so, nv30->screen->rankine, NV34TCL_TX_UNITS_ENABLE, 1);
+	so_data  (so, fp->samplers);
+	so_ref(so, &fp->so);
+
+update_constants:
+	if (fp->nr_consts) {
+		float *map;
+		
+		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		for (i = 0; i < fp->nr_consts; i++) {
+			struct nv30_fragment_program_data *fpd = &fp->consts[i];
+			uint32_t *p = &fp->insn[fpd->offset];
+			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
+
+			if (!memcmp(p, cb, 4 * sizeof(float)))
+				continue;
+			memcpy(p, cb, 4 * sizeof(float));
+			new_consts = TRUE;
+		}
+		ws->buffer_unmap(ws, constbuf);
+
+		if (new_consts)
+			nv30_fragprog_upload(nv30, fp);
+	}
+
+	if (new_consts || fp->so != nv30->state.hw[NV30_STATE_FRAGPROG]) {
+		so_ref(fp->so, &nv30->state.hw[NV30_STATE_FRAGPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv30_fragprog_destroy(struct nv30_context *nv30,
+		      struct nv30_fragment_program *fp)
+{
+	if (fp->insn_len)
+		FREE(fp->insn);
+}
+
+struct nv30_state_entry nv30_state_fragprog = {
+	.validate = nv30_fragprog_validate,
+	.dirty = {
+		.pipe = NV30_NEW_FRAGPROG,
+		.hw = NV30_STATE_FRAGPROG
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
new file mode 100644
index 00000000000..9e6f746a420
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -0,0 +1,186 @@
+#include "nv30_context.h"
+
+static INLINE int log2i(int i)
+{
+	int r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,swsurf)                 \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),           \
+  swsurf                                                                       \
+}
+
+struct nv30_texture_format {
+	boolean defined;
+	uint	pipe;
+	int     format;
+	int     swizzle;
+	int	swizzled_surface;
+};
+
+static struct nv30_texture_format
+nv30_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 1),
+	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 1),
+	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 1),
+	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 1),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 1),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 1),
+	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 1),
+//	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0),
+//	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0),
+	{},
+};
+
+static struct nv30_texture_format *
+nv30_fragtex_format(uint pipe_format)
+{
+	struct nv30_texture_format *tf = nv30_texture_formats;
+	char fs[128];
+
+	while (tf->defined) {
+		if (tf->pipe == pipe_format)
+			return tf;
+		tf++;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format));
+	return NULL;
+}
+
+
+static struct nouveau_stateobj *
+nv30_fragtex_build(struct nv30_context *nv30, int unit)
+{
+	struct nv30_sampler_state *ps = nv30->tex_sampler[unit];
+	struct nv30_miptree *nv30mt = nv30->tex_miptree[unit];
+	struct pipe_texture *pt = &nv30mt->base;
+	struct nv30_texture_format *tf;
+	struct nouveau_stateobj *so;
+	uint32_t txf, txs /*, txp*/;
+	/*int swizzled = 0;*/ /*XXX: implement in region code? */
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv30_fragtex_format(pt->format);
+	if (!tf)
+		assert(0);
+
+	tex_flags |= (tf->swizzled_surface ? NOUVEAU_BO_SWIZZLED : 0);
+
+	txf  = tf->format;
+	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
+	txf |= log2i(pt->width[0]) << 20;
+	txf |= log2i(pt->height[0]) << 24;
+	txf |= log2i(pt->depth[0]) << 28;
+	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return NULL;
+	}
+
+	txs = tf->swizzle;
+
+	so = so_new(16, 2);
+	so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8);
+	so_reloc (so, nv30mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, nv30mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
+		  NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV34TCL_TX_ENABLE_ENABLE | ps->en);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | 0x2000 /*voodoo*/);
+	so_data  (so, (pt->width[0] << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+		       pt->height[0]);
+	so_data  (so, ps->bcol);
+
+	return so;
+}
+
+static boolean
+nv30_fragtex_validate(struct nv30_context *nv30)
+{
+	struct nv30_fragment_program *fp = nv30->fragprog;
+	struct nv30_state *state = &nv30->state;
+	struct nouveau_stateobj *so;
+	unsigned samplers, unit;
+
+	samplers = state->fp_samplers & ~fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		so = so_new(2, 0);
+		so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1);
+		so_data  (so, 0);
+		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit));
+	}
+
+	samplers = nv30->dirty_samplers & fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		so = nv30_fragtex_build(nv30, unit);
+		so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit));
+	}
+
+	nv30->state.fp_samplers = fp->samplers;
+	return FALSE;
+}
+
+struct nv30_state_entry nv30_state_fragtex = {
+	.validate = nv30_fragtex_validate,
+	.dirty = {
+		.pipe = NV30_NEW_SAMPLER | NV30_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c
new file mode 100644
index 00000000000..9124db03e83
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_miptree.c
@@ -0,0 +1,162 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv30_context.h"
+
+static void
+nv30_miptree_layout(struct nv30_miptree *nv30mt)
+{
+	struct pipe_texture *pt = &nv30mt->base;
+	boolean swizzled = FALSE;
+	uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+	uint offset = 0;
+	int nr_faces, l, f, pitch;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth[0];
+	} else {
+		nr_faces = 1;
+	}
+	
+	pitch = pt->width[0];
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->depth[l] = depth;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+
+		if (swizzled)
+			pitch = pt->nblocksx[l];
+		pitch = align(pitch, 64);
+
+		nv30mt->level[l].pitch = pitch * pt->block.size;
+		nv30mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+		depth  = MAX2(1, depth  >> 1);
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			nv30mt->level[l].image_offset[f] = offset;
+			offset += nv30mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	nv30mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv30_miptree *mt;
+
+	mt = MALLOC(sizeof(struct nv30_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = pscreen;
+
+	nv30_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256,
+				       PIPE_BUFFER_USAGE_PIXEL |
+				       NOUVEAU_BUFFER_USAGE_TEXTURE,
+				       mt->total_size);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+
+	return &mt->base;
+}
+
+static void
+nv30_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **pt)
+{
+	struct pipe_texture *mt = *pt;
+
+	*pt = NULL;
+	if (--mt->refcount <= 0) {
+		struct nv30_miptree *nv30mt = (struct nv30_miptree *)mt;
+		int l;
+
+		pipe_buffer_reference(pscreen, &nv30mt->buffer, NULL);
+		for (l = 0; l <= mt->last_level; l++) {
+			if (nv30mt->level[l].image_offset)
+				FREE(nv30mt->level[l].image_offset);
+		}
+		FREE(nv30mt);
+	}
+}
+
+static struct pipe_surface *
+nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv30_miptree *nv30mt = (struct nv30_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = CALLOC_STRUCT(pipe_surface);
+	if (!ps)
+		return NULL;
+	pipe_texture_reference(&ps->texture, pt);
+	pipe_buffer_reference(pscreen, &ps->buffer, nv30mt->buffer);
+	ps->format = pt->format;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = nv30mt->level[level].pitch;
+	ps->usage = flags;
+	ps->status = PIPE_SURFACE_STATUS_DEFINED;
+	ps->refcount = 1;
+	ps->winsys = pscreen->winsys;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ps->offset = nv30mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		ps->offset = nv30mt->level[level].image_offset[zslice];
+	} else {
+		ps->offset = nv30mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv30_miptree_surface_del(struct pipe_screen *pscreen,
+			 struct pipe_surface **psurface)
+{
+	struct pipe_surface *ps = *psurface;
+
+	*psurface = NULL;
+	if (--ps->refcount > 0)
+		return;
+
+	pipe_texture_reference(&ps->texture, NULL);
+	pipe_buffer_reference(pscreen->winsys, &ps->buffer, NULL);
+	FREE(ps);
+}
+
+void
+nv30_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv30_miptree_create;
+	pscreen->texture_release = nv30_miptree_release;
+	pscreen->get_tex_surface = nv30_miptree_surface_new;
+	pscreen->tex_surface_release = nv30_miptree_surface_del;
+}
+
diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c
new file mode 100644
index 00000000000..d40d75f2640
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -0,0 +1,122 @@
+#include "pipe/p_context.h"
+
+#include "nv30_context.h"
+
+struct nv30_query {
+	struct nouveau_resource *object;
+	unsigned type;
+	boolean ready;
+	uint64_t result;
+};
+
+static INLINE struct nv30_query *
+nv30_query(struct pipe_query *pipe)
+{
+	return (struct nv30_query *)pipe;
+}
+
+static struct pipe_query *
+nv30_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nv30_query *q;
+
+	q = CALLOC(1, sizeof(struct nv30_query));
+	q->type = query_type;
+
+	return (struct pipe_query *)q;
+}
+
+static void
+nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+
+	if (q->object)
+		nv30->nvws->res_free(&q->object);
+	FREE(q);
+}
+
+static void
+nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object) {
+		uint64 tmp;
+		pipe->get_query_result(pipe, pq, 1, &tmp);
+	}
+
+	if (nv30->nvws->res_alloc(nv30->screen->query_heap, 1, NULL, &q->object))
+		assert(0);
+	nv30->nvws->notifier_reset(nv30->screen->query, q->object->start);
+
+	BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (1);
+	BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (1);
+
+	q->ready = FALSE;
+}
+
+static void
+nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+
+	BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1);
+	OUT_RING  ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING(NULL);
+}
+
+static boolean
+nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64 *result)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_query *q = nv30_query(pq);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	if (!q->ready) {
+		unsigned status;
+
+		status = nvws->notifier_status(nv30->screen->query,
+					       q->object->start);
+		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+			if (wait == FALSE)
+				return FALSE;
+			nvws->notifier_wait(nv30->screen->query, q->object->start,
+					    NV_NOTIFY_STATE_STATUS_COMPLETED,
+					    0);
+		}
+
+		q->result = nvws->notifier_retval(nv30->screen->query,
+						  q->object->start);
+		q->ready = TRUE;
+		nvws->res_free(&q->object);
+	}
+
+	*result = q->result;
+	return TRUE;
+}
+
+void
+nv30_init_query_functions(struct nv30_context *nv30)
+{
+	nv30->pipe.create_query = nv30_query_create;
+	nv30->pipe.destroy_query = nv30_query_destroy;
+	nv30->pipe.begin_query = nv30_query_begin;
+	nv30->pipe.end_query = nv30_query_end;
+	nv30->pipe.get_query_result = nv30_query_result;
+}
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
new file mode 100644
index 00000000000..a595e2eb225
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -0,0 +1,332 @@
+#include "pipe/p_screen.h"
+
+#include "nv30_context.h"
+#include "nv30_screen.h"
+
+#define NV30TCL_CHIPSET_3X_MASK 0x00000003
+#define NV34TCL_CHIPSET_3X_MASK 0x00000010
+#define NV35TCL_CHIPSET_3X_MASK 0x000001e0
+
+static const char *
+nv30_screen_get_name(struct pipe_screen *pscreen)
+{
+	struct nv30_screen *screen = nv30_screen(pscreen);
+	struct nouveau_device *dev = screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv30_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+static int
+nv30_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 16;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 2;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+		return 0;
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case NOUVEAU_CAP_HW_VTXBUF:
+	case NOUVEAU_CAP_HW_IDXBUF:
+		return 1;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv30_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 8.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 4.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static boolean
+nv30_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_A8L8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_Z24S8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static void *
+nv30_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys *ws = screen->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, surface->buffer, flags);
+	if (!map)
+		return NULL;
+
+	return map + surface->offset;
+}
+
+static void
+nv30_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+
+	ws->buffer_unmap(ws, surface->buffer);
+}
+
+static void
+nv30_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv30_screen *screen = nv30_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->res_free(&screen->vp_exec_heap);
+	nvws->res_free(&screen->vp_data_heap);
+	nvws->res_free(&screen->query_heap);
+	nvws->notifier_free(&screen->query);
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->rankine);
+
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+	struct nouveau_stateobj *so;
+	unsigned rankine_class = 0;
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret, i;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	switch (chipset & 0xf0) {
+	case 0x30:
+		if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+			rankine_class = 0x0397;
+		else
+		if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+			rankine_class = 0x0697;
+		else
+		if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f)))
+			rankine_class = 0x0497;
+		break;
+	default:
+		break;
+	}
+
+	if (!rankine_class) {
+		NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", chipset);
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, rankine_class, &screen->rankine);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv30_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Query objects */
+	ret = nvws->notifier_alloc(nvws, 32, &screen->query);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nv30_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	ret = nvws->res_init(&screen->query_heap, 0, 32);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nv30_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Vtxprog resources */
+	if (nvws->res_init(&screen->vp_exec_heap, 0, 256) ||
+	    nvws->res_init(&screen->vp_data_heap, 0, 256)) {
+		nv30_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Static rankine initialisation */
+	so = so_new(128, 0);
+	so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1);
+	so_data  (so, screen->sync->handle);
+	so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, screen->rankine, NV34TCL_DMA_COLOR1, 1);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->rankine, NV34TCL_DMA_COLOR0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->rankine, NV34TCL_DMA_VTXBUF0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+/*	so_method(so, screen->rankine, NV34TCL_DMA_FENCE, 2);
+	so_data  (so, 0);
+	so_data  (so, screen->query->handle);*/
+	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY7, 1);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY8, 1);
+	so_data  (so, nvws->channel->vram->handle);
+
+	for (i=1; i<8; i++) {
+		so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		so_data  (so, 0);
+		so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_VERT(i), 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, screen->rankine, 0x220, 1);
+	so_data  (so, 1);
+
+	so_method(so, screen->rankine, 0x03b0, 1);
+	so_data  (so, 0x00100000);
+	so_method(so, screen->rankine, 0x1454, 1);
+	so_data  (so, 0);
+	so_method(so, screen->rankine, 0x1d80, 1);
+	so_data  (so, 3);
+	so_method(so, screen->rankine, 0x1450, 1);
+	so_data  (so, 0x00030004);
+	
+	/* NEW */
+	so_method(so, screen->rankine, 0x1e98, 1);
+	so_data  (so, 0);
+	so_method(so, screen->rankine, 0x17e0, 3);
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(1.0));
+	so_method(so, screen->rankine, 0x1f80, 16);
+	for (i=0; i<16; i++) {
+		so_data  (so, (i==8) ? 0x0000ffff : 0);
+	}
+
+	so_method(so, screen->rankine, 0x120, 3);
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 2);
+
+	so_method(so, screen->rankine, 0x1d88, 1);
+	so_data  (so, 0x00001200);
+
+	so_method(so, screen->rankine, NV34TCL_RC_ENABLE, 1);
+	so_data  (so, 0);
+
+	so_method(so, screen->rankine, NV34TCL_DEPTH_RANGE_NEAR, 2);
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(1.0));
+
+	so_method(so, screen->rankine, NV34TCL_MULTISAMPLE_CONTROL, 1);
+	so_data  (so, 0xffff0000);
+
+	/* enables use of vp rather than fixed-function somehow */
+	so_method(so, screen->rankine, 0x1e94, 1);
+	so_data  (so, 0x13);
+
+	so_emit(nvws, so);
+	so_ref(NULL, &so);
+	nvws->push_flush(nvws, 0, NULL);
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv30_screen_destroy;
+
+	screen->pipe.get_name = nv30_screen_get_name;
+	screen->pipe.get_vendor = nv30_screen_get_vendor;
+	screen->pipe.get_param = nv30_screen_get_param;
+	screen->pipe.get_paramf = nv30_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv30_screen_surface_format_supported;
+
+	screen->pipe.surface_map = nv30_surface_map;
+	screen->pipe.surface_unmap = nv30_surface_unmap;
+
+	nv30_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv30/nv30_screen.h b/src/gallium/drivers/nv30/nv30_screen.h
new file mode 100644
index 00000000000..b7ddc2a9594
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_screen.h
@@ -0,0 +1,35 @@
+#ifndef __NV30_SCREEN_H__
+#define __NV30_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv30_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+
+	unsigned cur_pctx;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *rankine;
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;
+	struct nouveau_resource *vp_data_heap;
+
+	/* Current 3D state of channel */
+	struct nouveau_stateobj *state[NV30_STATE_MAX];
+};
+
+static INLINE struct nv30_screen *
+nv30_screen(struct pipe_screen *screen)
+{
+	return (struct nv30_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv30/nv30_shader.h b/src/gallium/drivers/nv30/nv30_shader.h
new file mode 100644
index 00000000000..dd3a36f78f3
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_shader.h
@@ -0,0 +1,490 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ *   ABS - MOV + NV40_VP_INST0_DEST_ABS
+ *   POW - EX2 + MUL + LG2
+ *   SUB - ADD, second source negated
+ *   SWZ - MOV
+ *   XPD -  
+ *
+ * Register access
+ *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ *   - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ *   According to the value returned for
+ *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ *   there are only two address registers available.  The destination in the
+ *   ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ *   When using vanilla ARB_v_p, the proprietary driver will squish both the
+ *   available ADDRESS regs into the first hardware reg in the X and Y
+ *   components.
+ *
+ *   To use an address reg as an index into consts, the CONST_SRC is set to
+ *   (const_base + offset) and INDEX_CONST is set.
+ *
+ *   To access the second address reg use ADDR_REG_SELECT_1. A particular
+ *   component of the address regs is selected with ADDR_SWZ.
+ *
+ *   Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details) Conditional
+ * execution of an instruction is enabled by setting COND_TEST_ENABLE, and
+ * selecting the condition which will allow the test to pass with
+ * COND_{FL,LT,...}.  It is possible to swizzle the values in the condition
+ * register, which allows for testing against an individual component.
+ *
+ * Branching:
+ *
+ *   The BRA/CAL instructions seem to follow a slightly different opcode
+ *   layout.  The destination instruction ID (IADDR) overlaps a source field.
+ *   Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ *   command, and is incremented automatically on each UPLOAD_INST FIFO
+ *   command.
+ *
+ *   Conditional branching is achieved by using the condition tests described
+ *   above.  There doesn't appear to be dedicated looping instructions, but
+ *   this can be done using a temp reg + conditional branching.
+ *
+ *   Subroutines may be uploaded before the main program itself, but the first
+ *   executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
+#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT         (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
+#define NV30_VP_INST_COND_SHIFT          11
+#define NV30_VP_INST_COND_MASK          (0x07 << 11)
+#  define NV30_VP_INST_COND_FL  0 /* guess */  
+#  define NV30_VP_INST_COND_LT  1  
+#  define NV30_VP_INST_COND_EQ  2
+#  define NV30_VP_INST_COND_LE  3
+#  define NV30_VP_INST_COND_GT  4
+#  define NV30_VP_INST_COND_NE  5
+#  define NV30_VP_INST_COND_GE  6
+#  define NV30_VP_INST_COND_TR  7 /* guess */
+#define NV30_VP_INST_COND_SWZ_X_SHIFT        9
+#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7
+#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5
+#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT        1
+#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0
+#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0)
+
+/* DWORD 1 */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28
+#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28)
+#  define NV30_VP_INST_OP_NOP  0x00
+#  define NV30_VP_INST_OP_RCP  0x02
+#  define NV30_VP_INST_OP_RCC  0x03
+#  define NV30_VP_INST_OP_RSQ  0x04
+#  define NV30_VP_INST_OP_EXP  0x05
+#  define NV30_VP_INST_OP_LOG  0x06
+#  define NV30_VP_INST_OP_LIT  0x07
+#  define NV30_VP_INST_OP_BRA  0x09
+#  define NV30_VP_INST_OP_CAL  0x0B
+#  define NV30_VP_INST_OP_RET  0x0C
+#  define NV30_VP_INST_OP_LG2  0x0D
+#  define NV30_VP_INST_OP_EX2  0x0E
+#  define NV30_VP_INST_OP_SIN  0x0F
+#  define NV30_VP_INST_OP_COS  0x10
+#define NV30_VP_INST_VEC_OPCODE_SHIFT        23
+#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23)
+#  define NV30_VP_INST_OP_NOPV  0x00
+#  define NV30_VP_INST_OP_MOV  0x01
+#  define NV30_VP_INST_OP_MUL  0x02
+#  define NV30_VP_INST_OP_ADD  0x03
+#  define NV30_VP_INST_OP_MAD  0x04
+#  define NV30_VP_INST_OP_DP3  0x05
+#  define NV30_VP_INST_OP_DP4  0x07
+#  define NV30_VP_INST_OP_DPH  0x06
+#  define NV30_VP_INST_OP_DST  0x08
+#  define NV30_VP_INST_OP_MIN  0x09
+#  define NV30_VP_INST_OP_MAX  0x0A
+#  define NV30_VP_INST_OP_SLT  0x0B
+#  define NV30_VP_INST_OP_SGE  0x0C
+#  define NV30_VP_INST_OP_ARL  0x0D
+#  define NV30_VP_INST_OP_FRC  0x0E
+#  define NV30_VP_INST_OP_FLR  0x0F
+#  define NV30_VP_INST_OP_SEQ  0x10
+#  define NV30_VP_INST_OP_SFL  0x11
+#  define NV30_VP_INST_OP_SGT  0x12
+#  define NV30_VP_INST_OP_SLE  0x13
+#  define NV30_VP_INST_OP_SNE  0x14
+#  define NV30_VP_INST_OP_STR  0x15
+#  define NV30_VP_INST_OP_SSG  0x16
+#  define NV30_VP_INST_OP_ARR  0x17
+#  define NV30_VP_INST_OP_ARA  0x18
+#define NV30_VP_INST_CONST_SRC_SHIFT        14
+#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/
+#  define NV30_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
+#  define NV30_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
+#  define NV30_VP_INST_IN_NORMAL  2    
+#  define NV30_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
+#  define NV30_VP_INST_IN_COL1  4
+#  define NV30_VP_INST_IN_FOGC  5
+#  define NV30_VP_INST_IN_TC0  8
+#  define NV30_VP_INST_IN_TC(n)  (8+n)
+#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions.  See Branching: label above
+ *
+ * DWORD 2
+ */
+#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/
+#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+#define NV30_VP_INST_IADDR_SHIFT        2
+#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+
+/* DWORD 3 */
+#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
+#define NV30_VP_INST_DEST_SHIFT        2
+#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#  define NV30_VP_INST_DEST_POS  0
+#  define NV30_VP_INST_DEST_BFC0  1
+#  define NV30_VP_INST_DEST_BFC1  2
+#  define NV30_VP_INST_DEST_COL0  3
+#  define NV30_VP_INST_DEST_COL1  4
+#  define NV30_VP_INST_DEST_FOGC  5
+#  define NV30_VP_INST_DEST_PSZ   6
+#  define NV30_VP_INST_DEST_TC(n)  (8+n)
+
+#define NV30_VP_INST_LAST                           (1 << 0)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV30_VP_SRC0_HIGH_SHIFT                                                6
+#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT                                                4
+#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+#define NV30_VP_SRC_NEGATE          (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT        12
+#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT        10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT        8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6)
+#define NV30_VP_SRC_TEMP_SRC_SHIFT        2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0)
+#define NV30_VP_SRC_REG_TYPE_SHIFT        0
+#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP  1
+#define NV30_VP_SRC_REG_TYPE_INPUT  2
+#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *   0 - Opcode, output reg/mask, ATTRIB source
+ *   1 - Source 0
+ *   2 - Source 1
+ *   3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *     result.color == R0.xyzw
+ *     result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *     ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *   LIT
+ *   LRP - MAD+MAD
+ *   SUB - ADD, negate second source
+ *   RSQ - LG2 + EX2
+ *   POW - LG2 + MUL + EX2
+ *   SCS - COS + SIN
+ *   XPD
+ */
+
+//== Opcode / Destination selection ==
+#define NV30_FP_OP_PROGRAM_END          (1 << 0)
+#define NV30_FP_OP_OUT_REG_SHIFT        1
+#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV30_FP_OP_OUT_REG_HALF          (1 << 7)
+#define NV30_FP_OP_COND_WRITE_ENABLE        (1 << 8)
+#define NV30_FP_OP_OUTMASK_SHIFT        9
+#define NV30_FP_OP_OUTMASK_MASK          (0xF << 9)
+#  define NV30_FP_OP_OUT_X  (1<<9)
+#  define NV30_FP_OP_OUT_Y  (1<<10)
+#  define NV30_FP_OP_OUT_Z  (1<<11)
+#  define NV30_FP_OP_OUT_W  (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV30_FP_OP_INPUT_SRC_SHIFT        13
+#define NV30_FP_OP_INPUT_SRC_MASK        (15 << 13)
+#  define NV30_FP_OP_INPUT_SRC_POSITION  0x0
+#  define NV30_FP_OP_INPUT_SRC_COL0  0x1
+#  define NV30_FP_OP_INPUT_SRC_COL1  0x2
+#  define NV30_FP_OP_INPUT_SRC_FOGC  0x3
+#  define NV30_FP_OP_INPUT_SRC_TC0    0x4
+#  define NV30_FP_OP_INPUT_SRC_TC(n)  (0x4 + n)
+#define NV30_FP_OP_TEX_UNIT_SHIFT        17
+#define NV30_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
+#define NV30_FP_OP_PRECISION_SHIFT        22
+#define NV30_FP_OP_PRECISION_MASK        (3 << 22)
+#   define NV30_FP_PRECISION_FP32  0
+#   define NV30_FP_PRECISION_FP16  1
+#   define NV30_FP_PRECISION_FX12  2
+#define NV30_FP_OP_OPCODE_SHIFT          24
+#define NV30_FP_OP_OPCODE_MASK          (0x3F << 24)
+#  define NV30_FP_OP_OPCODE_NOP  0x00
+#  define NV30_FP_OP_OPCODE_MOV  0x01
+#  define NV30_FP_OP_OPCODE_MUL  0x02
+#  define NV30_FP_OP_OPCODE_ADD  0x03
+#  define NV30_FP_OP_OPCODE_MAD  0x04
+#  define NV30_FP_OP_OPCODE_DP3  0x05
+#  define NV30_FP_OP_OPCODE_DP4  0x06
+#  define NV30_FP_OP_OPCODE_DST  0x07
+#  define NV30_FP_OP_OPCODE_MIN  0x08
+#  define NV30_FP_OP_OPCODE_MAX  0x09
+#  define NV30_FP_OP_OPCODE_SLT  0x0A
+#  define NV30_FP_OP_OPCODE_SGE  0x0B
+#  define NV30_FP_OP_OPCODE_SLE  0x0C
+#  define NV30_FP_OP_OPCODE_SGT  0x0D
+#  define NV30_FP_OP_OPCODE_SNE  0x0E
+#  define NV30_FP_OP_OPCODE_SEQ  0x0F
+#  define NV30_FP_OP_OPCODE_FRC  0x10
+#  define NV30_FP_OP_OPCODE_FLR  0x11
+#  define NV30_FP_OP_OPCODE_KIL  0x12
+#  define NV30_FP_OP_OPCODE_PK4B   0x13
+#  define NV30_FP_OP_OPCODE_UP4B   0x14
+#  define NV30_FP_OP_OPCODE_DDX  0x15 /* can only write XY */
+#  define NV30_FP_OP_OPCODE_DDY  0x16 /* can only write XY */
+#  define NV30_FP_OP_OPCODE_TEX  0x17
+#  define NV30_FP_OP_OPCODE_TXP  0x18
+#  define NV30_FP_OP_OPCODE_TXD  0x19
+#  define NV30_FP_OP_OPCODE_RCP  0x1A
+#  define NV30_FP_OP_OPCODE_RSQ  0x1B
+#  define NV30_FP_OP_OPCODE_EX2  0x1C
+#  define NV30_FP_OP_OPCODE_LG2  0x1D
+#  define NV30_FP_OP_OPCODE_LIT  0x1E
+#  define NV30_FP_OP_OPCODE_LRP  0x1F
+#  define NV30_FP_OP_OPCODE_STR  0x20 
+#  define NV30_FP_OP_OPCODE_SFL  0x21
+#  define NV30_FP_OP_OPCODE_COS  0x22
+#  define NV30_FP_OP_OPCODE_SIN  0x23
+#  define NV30_FP_OP_OPCODE_PK2H   0x24
+#  define NV30_FP_OP_OPCODE_UP2H   0x25
+#  define NV30_FP_OP_OPCODE_POW  0x26
+#  define NV30_FP_OP_OPCODE_PK4UB  0x27
+#  define NV30_FP_OP_OPCODE_UP4UB  0x28
+#  define NV30_FP_OP_OPCODE_PK2US  0x29
+#  define NV30_FP_OP_OPCODE_UP2US  0x2A
+#  define NV30_FP_OP_OPCODE_DP2A   0x2E
+#  define NV30_FP_OP_OPCODE_TXB  0x31
+#  define NV30_FP_OP_OPCODE_RFL  0x36
+#  define NV30_FP_OP_OPCODE_DIV  0x3A
+#define NV30_FP_OP_OUT_SAT          (1 << 31)
+
+/* high order bits of SRC0 */
+#define NV30_FP_OP_OUT_ABS          (1 << 29)
+#define NV30_FP_OP_COND_SWZ_W_SHIFT        27
+#define NV30_FP_OP_COND_SWZ_W_MASK        (3 << 27)
+#define NV30_FP_OP_COND_SWZ_Z_SHIFT        25
+#define NV30_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
+#define NV30_FP_OP_COND_SWZ_Y_SHIFT        23
+#define NV30_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
+#define NV30_FP_OP_COND_SWZ_X_SHIFT        21
+#define NV30_FP_OP_COND_SWZ_X_MASK        (3 << 21)
+#define NV30_FP_OP_COND_SWZ_ALL_SHIFT        21
+#define NV30_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21)
+#define NV30_FP_OP_COND_SHIFT          18
+#define NV30_FP_OP_COND_MASK          (0x07 << 18)
+#  define NV30_FP_OP_COND_FL  0
+#  define NV30_FP_OP_COND_LT  1
+#  define NV30_FP_OP_COND_EQ  2
+#  define NV30_FP_OP_COND_LE  3
+#  define NV30_FP_OP_COND_GT  4
+#  define NV30_FP_OP_COND_NE  5
+#  define NV30_FP_OP_COND_GE  6
+#  define NV30_FP_OP_COND_TR  7
+
+/* high order bits of SRC1 */
+#define NV30_FP_OP_DST_SCALE_SHIFT        28
+#define NV30_FP_OP_DST_SCALE_MASK        (3 << 28)
+#define NV30_FP_OP_DST_SCALE_1X                                                0
+#define NV30_FP_OP_DST_SCALE_2X                                                1
+#define NV30_FP_OP_DST_SCALE_4X                                                2
+#define NV30_FP_OP_DST_SCALE_8X                                                3
+#define NV30_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV30_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV30_FP_OP_DST_SCALE_INV_8X                                            7
+
+
+/* high order bits of SRC2 */
+#define NV30_FP_OP_INDEX_INPUT          (1 << 30)
+
+//== Register selection ==
+#define NV30_FP_REG_TYPE_SHIFT          0
+#define NV30_FP_REG_TYPE_MASK          (3 << 0)
+#  define NV30_FP_REG_TYPE_TEMP  0
+#  define NV30_FP_REG_TYPE_INPUT  1
+#  define NV30_FP_REG_TYPE_CONST  2
+#define NV30_FP_REG_SRC_SHIFT          2 /* uncertain */
+#define NV30_FP_REG_SRC_MASK          (31 << 2)
+#define NV30_FP_REG_SRC_HALF          (1 << 8)
+#define NV30_FP_REG_SWZ_ALL_SHIFT        9
+#define NV30_FP_REG_SWZ_ALL_MASK        (255 << 9)
+#define NV30_FP_REG_SWZ_X_SHIFT          9
+#define NV30_FP_REG_SWZ_X_MASK          (3 << 9)
+#define NV30_FP_REG_SWZ_Y_SHIFT          11
+#define NV30_FP_REG_SWZ_Y_MASK          (3 << 11)
+#define NV30_FP_REG_SWZ_Z_SHIFT          13
+#define NV30_FP_REG_SWZ_Z_MASK          (3 << 13)
+#define NV30_FP_REG_SWZ_W_SHIFT          15
+#define NV30_FP_REG_SWZ_W_MASK          (3 << 15)
+#  define NV30_FP_SWIZZLE_X  0
+#  define NV30_FP_SWIZZLE_Y  1
+#  define NV30_FP_SWIZZLE_Z  2
+#  define NV30_FP_SWIZZLE_W  3
+#define NV30_FP_REG_NEGATE          (1 << 17)
+
+#define NV30SR_NONE	0
+#define NV30SR_OUTPUT	1
+#define NV30SR_INPUT	2
+#define NV30SR_TEMP	3
+#define NV30SR_CONST	4
+
+struct nv30_sreg {
+	int type;
+	int index;
+
+	int dst_scale;
+
+	int negate;
+	int abs;
+	int swz[4];
+
+	int cc_update;
+	int cc_update_reg;
+	int cc_test;
+	int cc_test_reg;
+	int cc_swz[4];
+};
+
+static INLINE struct nv30_sreg
+nv30_sr(int type, int index)
+{
+	struct nv30_sreg temp = {
+		.type = type,
+		.index = index,
+		.dst_scale = DEF_SCALE,
+		.abs = 0,
+		.negate = 0,
+		.swz = { 0, 1, 2, 3 },
+		.cc_update = 0,
+		.cc_update_reg = 0,
+		.cc_test = DEF_CTEST,
+		.cc_test_reg = 0,
+		.cc_swz = { 0, 1, 2, 3 },
+	};
+	return temp;
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_swz(struct nv30_sreg src, int x, int y, int z, int w)
+{
+	struct nv30_sreg dst = src;
+
+	dst.swz[SWZ_X] = src.swz[x];
+	dst.swz[SWZ_Y] = src.swz[y];
+	dst.swz[SWZ_Z] = src.swz[z];
+	dst.swz[SWZ_W] = src.swz[w];
+	return dst;
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_neg(struct nv30_sreg src)
+{
+	src.negate = !src.negate;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_abs(struct nv30_sreg src)
+{
+	src.abs = 1;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+nv30_sr_scale(struct nv30_sreg src, int scale)
+{
+	src.dst_scale = scale;
+	return src;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c
new file mode 100644
index 00000000000..fc66075c83f
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -0,0 +1,724 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+static void *
+nv30_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso));
+	struct nouveau_stateobj *so = so_new(16, 0);
+
+	if (cso->blend_enable) {
+		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3);
+		so_data  (so, 1);
+		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rgb_src_factor));
+		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rgb_dst_factor));
+		so_method(so, rankine, NV34TCL_BLEND_EQUATION, 1);
+		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rgb_func));
+	} else {
+		so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, rankine, NV34TCL_COLOR_MASK, 1);
+	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
+
+	if (cso->logicop_enable) {
+		so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, rankine, NV34TCL_DITHER_ENABLE, 1);
+	so_data  (so, cso->dither ? 1 : 0);
+
+	so_ref(so, &bso->so);
+	bso->pipe = *cso;
+	return (void *)bso;
+}
+
+static void
+nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->blend = hwcso;
+	nv30->dirty |= NV30_NEW_BLEND;
+}
+
+static void
+nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_blend_state *bso = hwcso;
+
+	so_ref(NULL, &bso->so);
+	FREE(bso);
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_CLAMP;
+		break;
+/*	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP;
+		break;*/
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV34TCL_TX_WRAP_S_REPEAT;
+		break;
+	}
+
+	return ret >> NV34TCL_TX_WRAP_S_SHIFT;
+}
+
+static void *
+nv30_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv30_sampler_state *ps;
+	uint32_t filter = 0;
+
+	ps = MALLOC(sizeof(struct nv30_sampler_state));
+
+	ps->fmt = 0;
+	/* TODO: Not all RECTs formats have this bit set, bits 15-8 of format
+	   are the tx format to use. We should store normalized coord flag
+	   in sampler state structure, and set appropriate format in
+	   nvxx_fragtex_build()
+	 */
+	/*NV34TCL_TX_FORMAT_RECT*/
+	/*if (!cso->normalized_coords) {
+		ps->fmt |= (1<<14) ;
+	}*/
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT));
+
+	ps->en = 0;
+
+	if (cso->max_anisotropy >= 8.0) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+	} else
+	if (cso->max_anisotropy >= 4.0) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+	} else
+	if (cso->max_anisotropy >= 2.0) {
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+	{
+		float limit;
+
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
+	}
+
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+static void
+nv30_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv30->tex_sampler[unit] = sampler[unit];
+		nv30->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nv30->nr_samplers; unit++) {
+		nv30->tex_sampler[unit] = NULL;
+		nv30->dirty_samplers |= (1 << unit);
+	}
+
+	nv30->nr_samplers = nr;
+	nv30->dirty |= NV30_NEW_SAMPLER;
+}
+
+static void
+nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void
+nv30_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		pipe_texture_reference((struct pipe_texture **)
+				       &nv30->tex_miptree[unit], miptree[unit]);
+		nv30->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nv30->nr_textures; unit++) {
+		pipe_texture_reference((struct pipe_texture **)
+				       &nv30->tex_miptree[unit], NULL);
+		nv30->dirty_samplers |= (1 << unit);
+	}
+
+	nv30->nr_textures = nr;
+	nv30->dirty |= NV30_NEW_SAMPLER;
+}
+
+static void *
+nv30_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+
+	so_method(so, rankine, NV34TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT :
+				       NV34TCL_SHADE_MODEL_SMOOTH);
+
+	so_method(so, rankine, NV34TCL_LINE_WIDTH, 2);
+	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	so_method(so, rankine, NV34TCL_LINE_STIPPLE_ENABLE, 2);
+	so_data  (so, cso->line_stipple_enable ? 1 : 0);
+	so_data  (so, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	so_method(so, rankine, NV34TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+
+	so_method(so, rankine, NV34TCL_POLYGON_MODE_FRONT, 6);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV34TCL_FRONT_FACE_CCW);
+	} else {
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV34TCL_FRONT_FACE_CW);
+	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
+
+	so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+
+	so_method(so, rankine, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if (cso->offset_cw || cso->offset_ccw) {
+		so_method(so, rankine, NV34TCL_POLYGON_OFFSET_FACTOR, 2);
+		so_data  (so, fui(cso->offset_scale));
+		so_data  (so, fui(cso->offset_units * 2));
+	}
+
+	so_method(so, rankine, NV34TCL_POINT_SPRITE, 1);
+	if (cso->point_sprite) {
+		unsigned psctl = (1 << 0), i;
+
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				psctl |= (1 << (8 + i));
+		}
+
+		so_data(so, psctl);
+	} else {
+		so_data(so, 0);
+	}
+
+	so_ref(so, &rsso->so);
+	rsso->pipe = *cso;
+	return (void *)rsso;
+}
+
+static void
+nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->rasterizer = hwcso;
+	nv30->dirty |= NV30_NEW_RAST;
+	/*nv30->draw_dirty |= NV30_NEW_RAST;*/
+}
+
+static void
+nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_rasterizer_state *rsso = hwcso;
+
+	so_ref(NULL, &rsso->so);
+	FREE(rsso);
+}
+
+static void *
+nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+
+	so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3);
+	so_data  (so, nvgl_comparison_op(cso->depth.func));
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+
+	so_method(so, rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3);
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	so_data  (so, float_to_ubyte(cso->alpha.ref));
+
+	if (cso->stencil[0].enabled) {
+		so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 8);
+		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[0].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_data  (so, cso->stencil[0].ref_value);
+		so_data  (so, cso->stencil[0].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	if (cso->stencil[1].enabled) {
+		so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 8);
+		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[1].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_data  (so, cso->stencil[1].ref_value);
+		so_data  (so, cso->stencil[1].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &zsaso->so);
+	zsaso->pipe = *cso;
+	return (void *)zsaso;
+}
+
+static void
+nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->zsa = hwcso;
+	nv30->dirty |= NV30_NEW_ZSA;
+}
+
+static void
+nv30_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_zsa_state *zsaso = hwcso;
+
+	so_ref(NULL, &zsaso->so);
+	FREE(zsaso);
+}
+
+static void *
+nv30_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	/*struct nv30_context *nv30 = nv30_context(pipe);*/
+	struct nv30_vertex_program *vp;
+
+	vp = CALLOC(1, sizeof(struct nv30_vertex_program));
+	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	/*vp->draw = draw_create_vertex_shader(nv30->draw, &vp->pipe);*/
+
+	return (void *)vp;
+}
+
+static void
+nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->vertprog = hwcso;
+	nv30->dirty |= NV30_NEW_VERTPROG;
+	/*nv30->draw_dirty |= NV30_NEW_VERTPROG;*/
+}
+
+static void
+nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_vertex_program *vp = hwcso;
+
+	/*draw_delete_vertex_shader(nv30->draw, vp->draw);*/
+	nv30_vertprog_destroy(nv30, vp);
+	FREE((void*)vp->pipe.tokens);
+	FREE(vp);
+}
+
+static void *
+nv30_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv30_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nv30_fragment_program));
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+
+	return (void *)fp;
+}
+
+static void
+nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->fragprog = hwcso;
+	nv30->dirty |= NV30_NEW_FRAGPROG;
+}
+
+static void
+nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nv30_fragment_program *fp = hwcso;
+
+	nv30_fragprog_destroy(nv30, fp);
+	FREE((void*)fp->pipe.tokens);
+	FREE(fp);
+}
+
+static void
+nv30_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->blend_colour = *bcol;
+	nv30->dirty |= NV30_NEW_BCOL;
+}
+
+static void
+nv30_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+}
+
+static void
+nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->constbuf[shader] = buf->buffer;
+	nv30->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nv30->dirty |= NV30_NEW_VERTPROG;
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nv30->dirty |= NV30_NEW_FRAGPROG;
+	}
+}
+
+static void
+nv30_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->framebuffer = *fb;
+	nv30->dirty |= NV30_NEW_FB;
+}
+
+static void
+nv30_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	memcpy(nv30->stipple, stipple->stipple, 4 * 32);
+	nv30->dirty |= NV30_NEW_STIPPLE;
+}
+
+static void
+nv30_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->scissor = *s;
+	nv30->dirty |= NV30_NEW_SCISSOR;
+}
+
+static void
+nv30_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->viewport = *vpt;
+	nv30->dirty |= NV30_NEW_VIEWPORT;
+	/*nv30->draw_dirty |= NV30_NEW_VIEWPORT;*/
+}
+
+static void
+nv30_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	memcpy(nv30->vtxbuf, vb, sizeof(*vb) * count);
+	nv30->vtxbuf_nr = count;
+
+	nv30->dirty |= NV30_NEW_ARRAYS;
+	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/
+}
+
+static void
+nv30_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	memcpy(nv30->vtxelt, ve, sizeof(*ve) * count);
+	nv30->vtxelt_nr = count;
+
+	nv30->dirty |= NV30_NEW_ARRAYS;
+	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/
+}
+
+static void
+nv30_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+
+	nv30->edgeflags = bitfield;
+	nv30->dirty |= NV30_NEW_ARRAYS;
+	/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/
+}
+
+void
+nv30_init_state_functions(struct nv30_context *nv30)
+{
+	nv30->pipe.create_blend_state = nv30_blend_state_create;
+	nv30->pipe.bind_blend_state = nv30_blend_state_bind;
+	nv30->pipe.delete_blend_state = nv30_blend_state_delete;
+
+	nv30->pipe.create_sampler_state = nv30_sampler_state_create;
+	nv30->pipe.bind_sampler_states = nv30_sampler_state_bind;
+	nv30->pipe.delete_sampler_state = nv30_sampler_state_delete;
+	nv30->pipe.set_sampler_textures = nv30_set_sampler_texture;
+
+	nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create;
+	nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind;
+	nv30->pipe.delete_rasterizer_state = nv30_rasterizer_state_delete;
+
+	nv30->pipe.create_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_create;
+	nv30->pipe.bind_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_bind;
+	nv30->pipe.delete_depth_stencil_alpha_state =
+		nv30_depth_stencil_alpha_state_delete;
+
+	nv30->pipe.create_vs_state = nv30_vp_state_create;
+	nv30->pipe.bind_vs_state = nv30_vp_state_bind;
+	nv30->pipe.delete_vs_state = nv30_vp_state_delete;
+
+	nv30->pipe.create_fs_state = nv30_fp_state_create;
+	nv30->pipe.bind_fs_state = nv30_fp_state_bind;
+	nv30->pipe.delete_fs_state = nv30_fp_state_delete;
+
+	nv30->pipe.set_blend_color = nv30_set_blend_color;
+	nv30->pipe.set_clip_state = nv30_set_clip_state;
+	nv30->pipe.set_constant_buffer = nv30_set_constant_buffer;
+	nv30->pipe.set_framebuffer_state = nv30_set_framebuffer_state;
+	nv30->pipe.set_polygon_stipple = nv30_set_polygon_stipple;
+	nv30->pipe.set_scissor_state = nv30_set_scissor_state;
+	nv30->pipe.set_viewport_state = nv30_set_viewport_state;
+
+	nv30->pipe.set_edgeflags = nv30_set_edgeflags;
+	nv30->pipe.set_vertex_buffers = nv30_set_vertex_buffers;
+	nv30->pipe.set_vertex_elements = nv30_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv30/nv30_state.h b/src/gallium/drivers/nv30/nv30_state.h
new file mode 100644
index 00000000000..e6f23bf1667
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state.h
@@ -0,0 +1,85 @@
+#ifndef __NV30_STATE_H__
+#define __NV30_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv30_sampler_state {
+	uint32_t fmt;
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+struct nv30_vertex_program_exec {
+	uint32_t data[4];
+	boolean has_branch_offset;
+	int const_index;
+};
+
+struct nv30_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4];
+};
+
+struct nv30_vertex_program {
+	struct pipe_shader_state pipe;
+
+	boolean translated;
+
+	struct nv30_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv30_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+	struct nouveau_stateobj *so;
+};
+
+struct nv30_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv30_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	boolean on_hw;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv30_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	uint32_t fp_reg_control;
+	struct nouveau_stateobj *so;
+};
+
+struct nv30_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c
new file mode 100644
index 00000000000..44d43e132a5
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_blend.c
@@ -0,0 +1,40 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_blend_validate(struct nv30_context *nv30)
+{
+	so_ref(nv30->blend->so, &nv30->state.hw[NV30_STATE_BLEND]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_blend = {
+	.validate = nv30_state_blend_validate,
+	.dirty = {
+		.pipe = NV30_NEW_BLEND,
+		.hw = NV30_STATE_BLEND
+	}
+};
+
+static boolean
+nv30_state_blend_colour_validate(struct nv30_context *nv30)
+{
+	struct nouveau_stateobj *so = so_new(2, 0);
+	struct pipe_blend_color *bcol = &nv30->blend_colour;
+
+	so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1);
+	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) |
+		       (float_to_ubyte(bcol->color[0]) << 16) |
+		       (float_to_ubyte(bcol->color[1]) <<  8) |
+		       (float_to_ubyte(bcol->color[2]) <<  0)));
+
+	so_ref(so, &nv30->state.hw[NV30_STATE_BCOL]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_blend_colour = {
+	.validate = nv30_state_blend_colour_validate,
+	.dirty = {
+		.pipe = NV30_NEW_BCOL,
+		.hw = NV30_STATE_BCOL
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c
new file mode 100644
index 00000000000..40fed621b24
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_emit.c
@@ -0,0 +1,118 @@
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+static struct nv30_state_entry *render_states[] = {
+	&nv30_state_framebuffer,
+	&nv30_state_rasterizer,
+	&nv30_state_scissor,
+	&nv30_state_stipple,
+	&nv30_state_fragprog,
+	&nv30_state_fragtex,
+	&nv30_state_vertprog,
+	&nv30_state_blend,
+	&nv30_state_blend_colour,
+	&nv30_state_zsa,
+	&nv30_state_viewport,
+	&nv30_state_vbo,
+	NULL
+};
+
+static void
+nv30_state_do_validate(struct nv30_context *nv30,
+		       struct nv30_state_entry **states)
+{
+	const struct pipe_framebuffer_state *fb = &nv30->framebuffer;
+	unsigned i;
+
+	for (i = 0; i < fb->num_cbufs; i++)
+		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED;
+	if (fb->zsbuf)
+		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	while (*states) {
+		struct nv30_state_entry *e = *states;
+
+		if (nv30->dirty & e->dirty.pipe) {
+			if (e->validate(nv30)) {
+				nv30->state.dirty |= (1ULL << e->dirty.hw);
+			}
+		}
+
+		states++;
+	}
+	nv30->dirty = 0;
+}
+
+void
+nv30_state_emit(struct nv30_context *nv30)
+{
+	struct nv30_state *state = &nv30->state;
+	struct nv30_screen *screen = nv30->screen;
+	unsigned i, samplers;
+	uint64 states;
+
+	if (nv30->pctx_id != screen->cur_pctx) {
+		for (i = 0; i < NV30_STATE_MAX; i++) {
+			if (state->hw[i] && screen->state[i] != state->hw[i])
+				state->dirty |= (1ULL << i);
+		}
+
+		screen->cur_pctx = nv30->pctx_id;
+	}
+
+	for (i = 0, states = state->dirty; states; i++) {
+		if (!(states & (1ULL << i)))
+			continue;
+		so_ref (state->hw[i], &nv30->screen->state[i]);
+		if (state->hw[i])
+			so_emit(nv30->nvws, nv30->screen->state[i]);
+		states &= ~(1ULL << i);
+	}
+
+	state->dirty = 0;
+
+	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FB]);
+	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
+		if (!(samplers & (1 << i)))
+			continue;
+		so_emit_reloc_markers(nv30->nvws,
+				      state->hw[NV30_STATE_FRAGTEX0+i]);
+		samplers &= ~(1ULL << i);
+	}
+	so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FRAGPROG]);
+	if (state->hw[NV30_STATE_VTXBUF] /*&& nv30->render_mode == HW*/)
+		so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_VTXBUF]);
+}
+
+boolean
+nv30_state_validate(struct nv30_context *nv30)
+{
+#if 0
+	boolean was_sw = nv30->fallback_swtnl ? TRUE : FALSE;
+
+	if (nv30->render_mode != HW) {
+		/* Don't even bother trying to go back to hw if none
+		 * of the states that caused swtnl previously have changed.
+		 */
+		if ((nv30->fallback_swtnl & nv30->dirty)
+				!= nv30->fallback_swtnl)
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nv30->pipe.flush(&nv30->pipe, 0, NULL);
+		nv30->dirty |= (NV30_NEW_VIEWPORT |
+				NV30_NEW_VERTPROG |
+				NV30_NEW_ARRAYS);
+		nv30->render_mode = HW;
+	}
+#endif
+	nv30_state_do_validate(nv30, render_states);
+#if 0
+	if (nv30->fallback_swtnl || nv30->fallback_swrast)
+		return FALSE;
+	
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+#endif
+	return TRUE;
+}
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
new file mode 100644
index 00000000000..d93e2bb5c89
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -0,0 +1,128 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_framebuffer_validate(struct nv30_context *nv30)
+{
+	struct pipe_framebuffer_state *fb = &nv30->framebuffer;
+	struct pipe_surface *rt[2], *zeta = NULL;
+	uint32_t rt_enable, rt_format;
+	int i, colour_format = 0, zeta_format = 0;
+	struct nouveau_stateobj *so = so_new(64, 10);
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+
+	rt_enable = 0;
+	for (i = 0; i < fb->num_cbufs; i++) {
+		if (colour_format) {
+			assert(colour_format == fb->cbufs[i]->format);
+		} else {
+			colour_format = fb->cbufs[i]->format;
+			rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
+			rt[i] = fb->cbufs[i];
+		}
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1)
+		rt_enable |= NV34TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		break;
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) {
+		uint32_t pitch = rt[0]->stride;
+		if (zeta) {
+			pitch |= (zeta->stride << 16);
+		} else {
+			pitch |= (pitch << 16);
+		}
+
+		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR0, 1);
+		so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv30->nvws->channel->vram->handle,
+			  nv30->nvws->channel->gart->handle);
+		so_method(so, nv30->screen->rankine, NV34TCL_COLOR0_PITCH, 2);
+		so_data  (so, pitch);
+		so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR1, 1);
+		so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv30->nvws->channel->vram->handle,
+			  nv30->nvws->channel->gart->handle);
+		so_method(so, nv30->screen->rankine, NV34TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_data  (so, rt[1]->stride);
+	}
+
+	if (zeta_format) {
+		so_method(so, nv30->screen->rankine, NV34TCL_DMA_ZETA, 1);
+		so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv30->nvws->channel->vram->handle,
+			  nv30->nvws->channel->gart->handle);
+		so_method(so, nv30->screen->rankine, NV34TCL_ZETA_OFFSET, 1);
+		so_reloc (so, zeta->buffer, zeta->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		/* TODO: allocate LMA depth buffer */
+	}
+
+	so_method(so, nv30->screen->rankine, NV34TCL_RT_ENABLE, 1);
+	so_data  (so, rt_enable);
+	so_method(so, nv30->screen->rankine, NV34TCL_RT_HORIZ, 3);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_data  (so, rt_format);
+	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_data  (so, ((w - 1) << 16) | 0);
+	so_data  (so, ((h - 1) << 16) | 0);
+	so_method(so, nv30->screen->rankine, 0x1d88, 1);
+	so_data  (so, (1 << 12) | h);
+	/* Wonder why this is needed, context should all be set to zero on init */
+	so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1);
+	so_data  (so, 0);
+
+	so_ref(so, &nv30->state.hw[NV30_STATE_FB]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_framebuffer = {
+	.validate = nv30_state_framebuffer_validate,
+	.dirty = {
+		.pipe = NV30_NEW_FB,
+		.hw = NV30_STATE_FB
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_rasterizer.c b/src/gallium/drivers/nv30/nv30_state_rasterizer.c
new file mode 100644
index 00000000000..6d1b60e043d
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_rasterizer.c
@@ -0,0 +1,17 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_rasterizer_validate(struct nv30_context *nv30)
+{
+	so_ref(nv30->rasterizer->so,
+	       &nv30->state.hw[NV30_STATE_RAST]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_rasterizer = {
+	.validate = nv30_state_rasterizer_validate,
+	.dirty = {
+		.pipe = NV30_NEW_RAST,
+		.hw = NV30_STATE_RAST
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c
new file mode 100644
index 00000000000..1db9bc17955
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_scissor.c
@@ -0,0 +1,35 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_scissor_validate(struct nv30_context *nv30)
+{
+	struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe;
+	struct pipe_scissor_state *s = &nv30->scissor;
+	struct nouveau_stateobj *so;
+
+	if (nv30->state.hw[NV30_STATE_SCISSOR] &&
+	    (rast->scissor == 0 && nv30->state.scissor_enabled == 0))
+		return FALSE;
+	nv30->state.scissor_enabled = rast->scissor;
+
+	so = so_new(3, 0);
+	so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2);
+	if (nv30->state.scissor_enabled) {
+		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
+		so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
+	} else {
+		so_data  (so, 4096 << 16);
+		so_data  (so, 4096 << 16);
+	}
+
+	so_ref(so, &nv30->state.hw[NV30_STATE_SCISSOR]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_scissor = {
+	.validate = nv30_state_scissor_validate,
+	.dirty = {
+		.pipe = NV30_NEW_SCISSOR | NV30_NEW_RAST,
+		.hw = NV30_STATE_SCISSOR
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c
new file mode 100644
index 00000000000..41b42813b49
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_stipple.c
@@ -0,0 +1,39 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_stipple_validate(struct nv30_context *nv30)
+{
+	struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe;
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct nouveau_stateobj *so;
+
+	if (nv30->state.hw[NV30_STATE_STIPPLE] &&
+	   (rast->poly_stipple_enable == 0 && nv30->state.stipple_enabled == 0))
+		return FALSE;
+
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+
+		so = so_new(35, 0);
+		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+		for (i = 0; i < 32; i++)
+			so_data(so, nv30->stipple[i]);
+	} else {
+		so = so_new(2, 0);
+		so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &nv30->state.hw[NV30_STATE_STIPPLE]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_stipple = {
+	.validate = nv30_state_stipple_validate,
+	.dirty = {
+		.pipe = NV30_NEW_STIPPLE | NV30_NEW_RAST,
+		.hw = NV30_STATE_STIPPLE,
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c
new file mode 100644
index 00000000000..951d40ebfdd
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_viewport.c
@@ -0,0 +1,70 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_viewport_validate(struct nv30_context *nv30)
+{
+	struct pipe_viewport_state *vpt = &nv30->viewport;
+	struct nouveau_stateobj *so;
+	unsigned bypass;
+
+	if (/*nv30->render_mode == HW &&*/ !nv30->rasterizer->pipe.bypass_clipping)
+		bypass = 0;
+	else
+		bypass = 1;
+
+	if (nv30->state.hw[NV30_STATE_VIEWPORT] &&
+	    (bypass || !(nv30->dirty & NV30_NEW_VIEWPORT)) &&
+	    nv30->state.viewport_bypass == bypass)
+		return FALSE;
+	nv30->state.viewport_bypass = bypass;
+
+	so = so_new(11, 0);
+	if (!bypass) {
+		so_method(so, nv30->screen->rankine,
+			  NV34TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(vpt->translate[0]));
+		so_data  (so, fui(vpt->translate[1]));
+		so_data  (so, fui(vpt->translate[2]));
+		so_data  (so, fui(vpt->translate[3]));
+		so_data  (so, fui(vpt->scale[0]));
+		so_data  (so, fui(vpt->scale[1]));
+		so_data  (so, fui(vpt->scale[2]));
+		so_data  (so, fui(vpt->scale[3]));
+/*		so_method(so, nv30->screen->rankine, 0x1d78, 1);
+		so_data  (so, 1);
+*/	} else {
+		so_method(so, nv30->screen->rankine,
+			  NV34TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(0.0));
+		/* Not entirely certain what this is yet.  The DDX uses this
+		 * value also as it fixes rendering when you pass
+		 * pre-transformed vertices to the GPU.  My best gusss is that
+		 * this bypasses some culling/clipping stage.  Might be worth
+		 * noting that points/lines are uneffected by whatever this
+		 * value fixes, only filled polygons are effected.
+		 */
+/*		so_method(so, nv30->screen->rankine, 0x1d78, 1);
+		so_data  (so, 0x110);
+*/	}
+	/* TODO/FIXME: never saw value 0x0110 in renouveau dumps, only 0x0001 */
+	so_method(so, nv30->screen->rankine, 0x1d78, 1);
+	so_data  (so, 1);
+
+	so_ref(so, &nv30->state.hw[NV30_STATE_VIEWPORT]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_viewport = {
+	.validate = nv30_state_viewport_validate,
+	.dirty = {
+		.pipe = NV30_NEW_VIEWPORT | NV30_NEW_RAST,
+		.hw = NV30_STATE_VIEWPORT
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_state_zsa.c b/src/gallium/drivers/nv30/nv30_state_zsa.c
new file mode 100644
index 00000000000..0940b7269b2
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_state_zsa.c
@@ -0,0 +1,17 @@
+#include "nv30_context.h"
+
+static boolean
+nv30_state_zsa_validate(struct nv30_context *nv30)
+{
+	so_ref(nv30->zsa->so,
+	       &nv30->state.hw[NV30_STATE_ZSA]);
+	return TRUE;
+}
+
+struct nv30_state_entry nv30_state_zsa = {
+	.validate = nv30_state_zsa_validate,
+	.dirty = {
+		.pipe = NV30_NEW_ZSA,
+		.hw = NV30_STATE_ZSA
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_surface.c b/src/gallium/drivers/nv30/nv30_surface.c
new file mode 100644
index 00000000000..36f48877503
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_surface.c
@@ -0,0 +1,77 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv30_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_tile.h"
+
+static void
+nv30_surface_copy(struct pipe_context *pipe, unsigned do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	if (do_flip) {
+		/*XXX: This dodgyness will do for now for correctness.  But,
+		 *     need to investigate whether the 2D engine is able to
+		 *     manage a flip (perhaps SIFM?), if not, use the 3D engine
+		 */
+		desty += height;
+		while (height--) {
+			nvws->surface_copy(nvws, dest, destx, desty--, src,
+					   srcx, srcy++, width, 1);
+		}
+	} else {
+		nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+				   width, height);
+	}
+}
+
+static void
+nv30_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_winsys *nvws = nv30->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+void
+nv30_init_surface_functions(struct nv30_context *nv30)
+{
+	nv30->pipe.surface_copy = nv30_surface_copy;
+	nv30->pipe.surface_fill = nv30_surface_fill;
+}
diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c
new file mode 100644
index 00000000000..556f981d4a5
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_vbo.c
@@ -0,0 +1,556 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+#define FORCE_SWTNL 0
+
+static INLINE int
+nv30_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	switch (pipe) {
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
+		break;
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
+		break;
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*fmt = NV34TCL_VTXFMT_TYPE_USHORT;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe));
+		return 1;
+	}
+
+	switch (pipe) {
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R16_SSCALED:
+		*ncomp = 1;
+		break;
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R16G16_SSCALED:
+		*ncomp = 2;
+		break;
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+		*ncomp = 3;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*ncomp = 4;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe));
+		return 1;
+	}
+
+	return 0;
+}
+
+static boolean
+nv30_vbo_set_idxbuf(struct nv30_context *nv30, struct pipe_buffer *ib,
+		    unsigned ib_size)
+{
+	struct pipe_screen *pscreen = &nv30->screen->pipe;
+	unsigned type;
+
+	if (!ib) {
+		nv30->idxbuf = NULL;
+		nv30->idxbuf_format = 0xdeadbeef;
+		return FALSE;
+	}
+
+	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1)
+		return FALSE;
+
+	switch (ib_size) {
+	case 2:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
+		break;
+	case 4:
+		type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+		break;
+	default:
+		return FALSE;
+	}
+
+	if (ib != nv30->idxbuf ||
+	    type != nv30->idxbuf_format) {
+		nv30->dirty |= NV30_NEW_ARRAYS;
+		nv30->idxbuf = ib;
+		nv30->idxbuf_format = type;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+{
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	unsigned type, ncomp;
+	void *map;
+
+	if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp))
+		return FALSE;
+
+	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	map += vb->buffer_offset + ve->src_offset;
+
+	switch (type) {
+	case NV34TCL_VTXFMT_TYPE_FLOAT:
+	{
+		float *v = map;
+
+		switch (ncomp) {
+		case 4:
+			so_method(so, rankine, NV34TCL_VTX_ATTR_4F_X(attrib), 4);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			so_data  (so, fui(v[3]));
+			break;
+		case 3:
+			so_method(so, rankine, NV34TCL_VTX_ATTR_3F_X(attrib), 3);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			break;
+		case 2:
+			so_method(so, rankine, NV34TCL_VTX_ATTR_2F_X(attrib), 2);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			break;
+		case 1:
+			so_method(so, rankine, NV34TCL_VTX_ATTR_1F(attrib), 1);
+			so_data  (so, fui(v[0]));
+			break;
+		default:
+			ws->buffer_unmap(ws, vb->buffer);
+			return FALSE;
+		}
+	}
+		break;
+	default:
+		ws->buffer_unmap(ws, vb->buffer);
+		return FALSE;
+	}
+
+	ws->buffer_unmap(ws, vb->buffer);
+
+	return TRUE;
+}
+
+boolean
+nv30_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_channel *chan = nv30->nvws->channel;
+	unsigned restart = 0;
+
+	nv30_vbo_set_idxbuf(nv30, NULL, 0);
+	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
+		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);*/
+		return FALSE;
+	}
+
+	while (count) {
+		unsigned vc, nr;
+
+		nv30_state_emit(nv30);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static INLINE void
+nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv30->nvws->channel;
+
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start;
+		unsigned vc, push, restart = 0;
+
+		nv30_state_emit(nv30);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		if (vc & 1) {
+			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING((elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv30->nvws->channel;
+
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start;
+		unsigned vc, push, restart = 0;
+
+		nv30_state_emit(nv30);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		if (vc & 1) {
+			BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING((elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv30->nvws->channel;
+
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start;
+		unsigned vc, push, restart = 0;
+
+		nv30_state_emit(nv30);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		while (vc) {
+			push = MIN2(vc, 2047);
+
+			BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (elts, push);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static boolean
+nv30_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!ib) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
+
+	switch (ib_size) {
+	case 1:
+		nv30_draw_elements_u08(nv30, map, mode, start, count);
+		break;
+	case 2:
+		nv30_draw_elements_u16(nv30, map, mode, start, count);
+		break;
+	case 4:
+		nv30_draw_elements_u32(nv30, map, mode, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	ws->buffer_unmap(ws, ib);
+	return TRUE;
+}
+
+static boolean
+nv30_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	struct nouveau_channel *chan = nv30->nvws->channel;
+	unsigned restart = 0;
+
+	while (count) {
+		unsigned nr, vc;
+
+		nv30_state_emit(nv30);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	return TRUE;
+}
+
+boolean
+nv30_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv30_context *nv30 = nv30_context(pipe);
+	boolean idxbuf;
+
+	idxbuf = nv30_vbo_set_idxbuf(nv30, indexBuffer, indexSize);
+	if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
+		/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);*/
+		return FALSE;	
+	}
+
+	if (idxbuf) {
+		nv30_draw_elements_vbo(pipe, mode, start, count);
+	} else {
+		nv30_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static boolean
+nv30_vbo_validate(struct nv30_context *nv30)
+{
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL;
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct pipe_buffer *ib = nv30->idxbuf;
+	unsigned ib_format = nv30->idxbuf_format;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	int hw;
+
+	if (nv30->edgeflags) {
+		/*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/
+		return FALSE;
+	}
+
+	vtxbuf = so_new(20, 18);
+	so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr);
+	vtxfmt = so_new(17, 0);
+	so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr);
+
+	for (hw = 0; hw < nv30->vtxelt_nr; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nv30->vtxelt[hw];
+		vb = &nv30->vtxbuf[ve->vertex_buffer_index];
+
+		if (!vb->pitch) {
+			if (!sattr)
+				sattr = so_new(16 * 5, 0);
+
+			if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) {
+				so_data(vtxbuf, 0);
+				so_data(vtxfmt, NV34TCL_VTXFMT_TYPE_FLOAT);
+				continue;
+			}
+		}
+
+		if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
+			/*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/
+			so_ref(NULL, &vtxbuf);
+			so_ref(NULL, &vtxfmt);
+			return FALSE;
+		}
+
+		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
+			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+			 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		so_data (vtxfmt, ((vb->pitch << NV34TCL_VTXFMT_STRIDE_SHIFT) |
+				  (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type));
+	}
+
+	if (ib) {
+		so_method(vtxbuf, rankine, NV34TCL_IDXBUF_ADDRESS, 2);
+		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
+			  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	so_method(vtxbuf, rankine, 0x1710, 1);
+	so_data  (vtxbuf, 0);
+
+	so_ref(vtxbuf, &nv30->state.hw[NV30_STATE_VTXBUF]);
+	nv30->state.dirty |= (1ULL << NV30_STATE_VTXBUF);
+	so_ref(vtxfmt, &nv30->state.hw[NV30_STATE_VTXFMT]);
+	nv30->state.dirty |= (1ULL << NV30_STATE_VTXFMT);
+	so_ref(sattr, &nv30->state.hw[NV30_STATE_VTXATTR]);
+	nv30->state.dirty |= (1ULL << NV30_STATE_VTXATTR);
+	return FALSE;
+}
+
+struct nv30_state_entry nv30_state_vbo = {
+	.validate = nv30_vbo_validate,
+	.dirty = {
+		.pipe = NV30_NEW_ARRAYS,
+		.hw = 0,
+	}
+};
diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c
new file mode 100644
index 00000000000..72824559e8b
--- /dev/null
+++ b/src/gallium/drivers/nv30/nv30_vertprog.c
@@ -0,0 +1,838 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "nv30_context.h"
+#include "nv30_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  2. Arb. swz/negation
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0
+#define DEF_CTEST 0
+#include "nv30_shader.h"
+
+#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv30_sr_neg((s))
+#define abs(s) nv30_sr_abs((s))
+
+struct nv30_vpc {
+	struct nv30_vertex_program *vp;
+
+	struct nv30_vertex_program_exec *vpi;
+
+	unsigned output_map[PIPE_MAX_SHADER_OUTPUTS];
+
+	int high_temp;
+	int temp_temp_count;
+
+	struct nv30_sreg *imm;
+	unsigned nr_imm;
+};
+
+static struct nv30_sreg
+temp(struct nv30_vpc *vpc)
+{
+	int idx;
+
+	idx  = vpc->temp_temp_count++;
+	idx += vpc->high_temp + 1;
+	return nv30_sr(NV30SR_TEMP, idx);
+}
+
+static struct nv30_sreg
+constant(struct nv30_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	struct nv30_vertex_program_data *vpd;
+	int idx;
+
+	if (pipe >= 0) {
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nv30_sr(NV30SR_CONST, idx);
+		}
+	}
+
+	idx = vp->nr_consts++;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nv30_sr(NV30SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv30_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2))
+
+static void
+emit_src(struct nv30_vpc *vpc, uint32_t *hw, int pos, struct nv30_sreg src)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV30SR_TEMP:
+		sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT);
+		sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT);
+		break;
+	case NV30SR_INPUT:
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		vp->ir |= (1 << src.index);
+		hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT);
+		break;
+	case NV30SR_CONST:
+		sr |= (NV30_VP_SRC_REG_TYPE_CONST <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);
+		vpc->vpi->const_index = src.index;
+		break;
+	case NV30SR_NONE:
+		sr |= (NV30_VP_SRC_REG_TYPE_INPUT <<
+		       NV30_VP_SRC_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV30_VP_SRC_NEGATE;
+
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT));
+
+/*
+ * |VVV|
+ * d�.�b
+ *  \u/
+ *
+ */
+
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >>
+			  NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) <<
+			  NV30_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >>
+			  NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) <<
+			  NV30_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+emit_dst(struct nv30_vpc *vpc, uint32_t *hw, int slot, struct nv30_sreg dst)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+
+	switch (dst.type) {
+	case NV30SR_TEMP:
+		hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		break;
+	case NV30SR_OUTPUT:
+		switch (dst.index) {
+		case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+		case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+		case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+		case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+		case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
+		case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+		case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+		case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+		case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+		case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+		case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+		case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+		case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+		case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+		default:
+			break;
+		}
+
+		hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+		hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+
+		/*XXX: no way this is entirely correct, someone needs to
+		 *     figure out what exactly it is.
+		 */
+		hw[3] |= 0x800;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+nv30_vp_arith(struct nv30_vpc *vpc, int slot, int op,
+	      struct nv30_sreg dst, int mask,
+	      struct nv30_sreg s0, struct nv30_sreg s1,
+	      struct nv30_sreg s2)
+{
+	struct nv30_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1;
+
+	hw = vpc->vpi->data;
+
+	hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV30_VP_INST_COND_SWZ_W_SHIFT));
+
+	hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+//	hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK;
+//	hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT);
+
+	if (dst.type == NV30SR_OUTPUT) {
+		if (slot)
+			hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+		else
+			hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+	} else {
+		if (slot)
+			hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+		else
+			hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+	}
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+}
+
+static INLINE struct nv30_sreg
+tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv30_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		if (vpc->high_temp < fsrc->SrcRegister.Index)
+			vpc->high_temp = fsrc->SrcRegister.Index;
+		src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv30_sreg
+tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nv30_sreg dst;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		dst = nv30_sr(NV30SR_OUTPUT,
+			      vpc->output_map[fdst->DstRegister.Index]);
+
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index);
+		if (vpc->high_temp < dst.index)
+			vpc->high_temp = dst.index;
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		break;
+	}
+
+	return dst;
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask = 0;
+
+	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
+	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
+	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
+	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
+	return mask;
+}
+
+static boolean
+nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv30_sreg src[3], dst, tmp;
+	struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	vpc->temp_temp_count = 0;
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		/*XXX: index comparison is broken now that consts come from
+		 *     two different register files.
+		 */
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_IMMEDIATE:
+			if (ci == -1 || ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV30_VP_INST_DEST_POS;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_VP_INST_DEST_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_VP_INST_DEST_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV30_VP_INST_DEST_BFC0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV30_VP_INST_DEST_BFC1;
+		} else {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV30_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NV30_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->output_map[fdec->DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv30_vertprog_prepare(struct nv30_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int nr_imm = 0;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv30_sreg));
+		assert(vpc->imm);
+	}
+
+	return TRUE;
+}
+
+static void
+nv30_vertprog_translate(struct nv30_context *nv30,
+			struct nv30_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv30_vpc *vpc = NULL;
+
+	tgsi_dump(vp->pipe.tokens,0);
+
+	vpc = CALLOC(1, sizeof(struct nv30_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+	vpc->high_temp = -1;
+
+	if (!nv30_vertprog_prepare(vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &parse.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_OUTPUT:
+				if (!nv30_vertprog_parse_decl_output(vpc, fdec))
+					goto out_err;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+//			assert(imm->Immediate.Size == 4);
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u.ImmediateFloat32[0].Float,
+					 imm->u.ImmediateFloat32[1].Float,
+					 imm->u.ImmediateFloat32[2].Float,
+					 imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv30_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	FREE(vpc);
+}
+
+static boolean
+nv30_vertprog_validate(struct nv30_context *nv30)
+{ 
+	struct nouveau_winsys *nvws = nv30->nvws;
+	struct pipe_winsys *ws = nv30->pipe.winsys;
+	struct nouveau_grobj *rankine = nv30->screen->rankine;
+	struct nv30_vertex_program *vp;
+	struct pipe_buffer *constbuf;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	vp = nv30->vertprog;
+	constbuf = nv30->constbuf[PIPE_SHADER_VERTEX];
+
+	/* Translate TGSI shader into hw bytecode */
+	if (!vp->translated) {
+		nv30_vertprog_translate(nv30, vp);
+		if (!vp->translated)
+			return FALSE;
+	}
+
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nv30->screen->vp_exec_heap;
+		struct nouveau_stateobj *so;
+		uint vplen = vp->nr_insns;
+
+		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+			while (heap->next && heap->size < vplen) {
+				struct nv30_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->exec);
+			}
+
+			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		so = so_new(2, 0);
+		so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
+		so_data  (so, vp->exec->start);
+		so_ref(so, &vp->so);
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nv30->screen->vp_data_heap;
+
+		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nv30_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv30_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv30_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK;
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NV30_VP_INST_CONST_SRC_SHIFT;
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = ws->buffer_map(ws, constbuf,
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nv30_vertex_program_data *vpd = &vp->consts[i];
+
+			if (vpd->index >= 0) {
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (i + vp->data->start);
+			OUT_RINGp ((uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf) {
+			ws->buffer_unmap(ws, constbuf);
+		}
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+				i, vp->insns[i].data[0], vp->insns[i].data[1],
+				vp->insns[i].data[2], vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (vp->insns[i].data, 4);
+		}
+	}
+
+	if (vp->so != nv30->state.hw[NV30_STATE_VERTPROG]) {
+		so_ref(vp->so, &nv30->state.hw[NV30_STATE_VERTPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp)
+{
+	struct nouveau_winsys *nvws = nv30->screen->nvws;
+
+	vp->translated = FALSE;
+
+	if (vp->nr_insns) {
+		FREE(vp->insns);
+		vp->insns = NULL;
+		vp->nr_insns = 0;
+	}
+
+	if (vp->nr_consts) {
+		FREE(vp->consts);
+		vp->consts = NULL;
+		vp->nr_consts = 0;
+	}
+
+	nvws->res_free(&vp->exec);
+	vp->exec_start = 0;
+	nvws->res_free(&vp->data);
+	vp->data_start = 0;
+	vp->data_start_min = 0;
+
+	vp->ir = vp->or = 0;
+	so_ref(NULL, &vp->so);
+}
+
+struct nv30_state_entry nv30_state_vertprog = {
+	.validate = nv30_vertprog_validate,
+	.dirty = {
+		.pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/,
+		.hw = NV30_STATE_VERTPROG,
+	}
+};
diff --git a/src/gallium/drivers/nv40/Makefile b/src/gallium/drivers/nv40/Makefile
new file mode 100644
index 00000000000..9c8eadf7e44
--- /dev/null
+++ b/src/gallium/drivers/nv40/Makefile
@@ -0,0 +1,37 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv40
+
+DRIVER_SOURCES = \
+	nv40_clear.c \
+	nv40_context.c \
+	nv40_draw.c \
+	nv40_fragprog.c \
+	nv40_fragtex.c \
+	nv40_miptree.c \
+	nv40_query.c \
+	nv40_screen.c \
+	nv40_state.c \
+	nv40_state_blend.c \
+	nv40_state_emit.c \
+	nv40_state_fb.c \
+	nv40_state_rasterizer.c \
+	nv40_state_scissor.c \
+	nv40_state_stipple.c \
+	nv40_state_viewport.c \
+	nv40_state_zsa.c \
+	nv40_surface.c \
+	nv40_vbo.c \
+	nv40_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv40/nv40_clear.c b/src/gallium/drivers/nv40/nv40_clear.c
new file mode 100644
index 00000000000..59efd620e32
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_clear.c
@@ -0,0 +1,13 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv40_context.h"
+
+void
+nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+	ps->status = PIPE_SURFACE_STATUS_CLEAR;
+}
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
new file mode 100644
index 00000000000..cc63dd734bc
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -0,0 +1,72 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv40_context.h"
+#include "nv40_screen.h"
+
+static void
+nv40_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(curie, 0x1fd8, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, 0x1fd8, 1);
+		OUT_RING  (1);
+	}
+
+	FIRE_RING(fence);
+}
+
+static void
+nv40_destroy(struct pipe_context *pipe)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	if (nv40->draw)
+		draw_destroy(nv40->draw);
+	FREE(nv40);
+}
+
+struct pipe_context *
+nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct nv40_screen *screen = nv40_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv40_context *nv40;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nv40 = CALLOC(1, sizeof(struct nv40_context));
+	if (!nv40)
+		return NULL;
+	nv40->screen = screen;
+	nv40->pctx_id = pctx_id;
+
+	nv40->nvws = nvws;
+
+	nv40->pipe.winsys = ws;
+	nv40->pipe.screen = pscreen;
+	nv40->pipe.destroy = nv40_destroy;
+	nv40->pipe.draw_arrays = nv40_draw_arrays;
+	nv40->pipe.draw_elements = nv40_draw_elements;
+	nv40->pipe.clear = nv40_clear;
+	nv40->pipe.flush = nv40_flush;
+
+	nv40_init_query_functions(nv40);
+	nv40_init_surface_functions(nv40);
+	nv40_init_state_functions(nv40);
+
+	/* Create, configure, and install fallback swtnl path */
+	nv40->draw = draw_create();
+	draw_wide_point_threshold(nv40->draw, 9999999.0);
+	draw_wide_line_threshold(nv40->draw, 9999999.0);
+	draw_enable_line_stipple(nv40->draw, FALSE);
+	draw_enable_point_sprites(nv40->draw, FALSE);
+	draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40));
+
+	return &nv40->pipe;
+}
+	
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
new file mode 100644
index 00000000000..adcfbdd85a8
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -0,0 +1,233 @@
+#ifndef __NV40_CONTEXT_H__
+#define __NV40_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv40_screen *ctx = nv40->screen
+#include "nouveau/nouveau_push.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#include "nv40_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+enum nv40_state_index {
+	NV40_STATE_FB = 0,
+	NV40_STATE_VIEWPORT = 1,
+	NV40_STATE_BLEND = 2,
+	NV40_STATE_RAST = 3,
+	NV40_STATE_ZSA = 4,
+	NV40_STATE_BCOL = 5,
+	NV40_STATE_CLIP = 6,
+	NV40_STATE_SCISSOR = 7,
+	NV40_STATE_STIPPLE = 8,
+	NV40_STATE_FRAGPROG = 9,
+	NV40_STATE_VERTPROG = 10,
+	NV40_STATE_FRAGTEX0 = 11,
+	NV40_STATE_FRAGTEX1 = 12,
+	NV40_STATE_FRAGTEX2 = 13,
+	NV40_STATE_FRAGTEX3 = 14,
+	NV40_STATE_FRAGTEX4 = 15,
+	NV40_STATE_FRAGTEX5 = 16,
+	NV40_STATE_FRAGTEX6 = 17,
+	NV40_STATE_FRAGTEX7 = 18,
+	NV40_STATE_FRAGTEX8 = 19,
+	NV40_STATE_FRAGTEX9 = 20,
+	NV40_STATE_FRAGTEX10 = 21,
+	NV40_STATE_FRAGTEX11 = 22,
+	NV40_STATE_FRAGTEX12 = 23,
+	NV40_STATE_FRAGTEX13 = 24,
+	NV40_STATE_FRAGTEX14 = 25,
+	NV40_STATE_FRAGTEX15 = 26,
+	NV40_STATE_VERTTEX0 = 27,
+	NV40_STATE_VERTTEX1 = 28,
+	NV40_STATE_VERTTEX2 = 29,
+	NV40_STATE_VERTTEX3 = 30,
+	NV40_STATE_VTXBUF = 31,
+	NV40_STATE_VTXFMT = 32,
+	NV40_STATE_VTXATTR = 33,
+	NV40_STATE_MAX = 34
+};
+
+#include "nv40_screen.h"
+
+#define NV40_NEW_BLEND		(1 <<  0)
+#define NV40_NEW_RAST		(1 <<  1)
+#define NV40_NEW_ZSA		(1 <<  2)
+#define NV40_NEW_SAMPLER	(1 <<  3)
+#define NV40_NEW_FB		(1 <<  4)
+#define NV40_NEW_STIPPLE	(1 <<  5)
+#define NV40_NEW_SCISSOR	(1 <<  6)
+#define NV40_NEW_VIEWPORT	(1 <<  7)
+#define NV40_NEW_BCOL		(1 <<  8)
+#define NV40_NEW_VERTPROG	(1 <<  9)
+#define NV40_NEW_FRAGPROG	(1 << 10)
+#define NV40_NEW_ARRAYS		(1 << 11)
+#define NV40_NEW_UCP		(1 << 12)
+
+struct nv40_rasterizer_state {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_blend_state {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+
+struct nv40_state {
+	unsigned scissor_enabled;
+	unsigned stipple_enabled;
+	unsigned viewport_bypass;
+	unsigned fp_samplers;
+
+	uint64_t dirty;
+	struct nouveau_stateobj *hw[NV40_STATE_MAX];
+};
+
+struct nv40_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv40_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	/* HW state derived from pipe states */
+	struct nv40_state state;
+	struct {
+		struct nv40_vertex_program *vertprog;
+
+		unsigned nr_attribs;
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+	unsigned fallback_swrast;
+
+	/* Context state */
+	unsigned dirty, draw_dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct pipe_clip_state clip;
+	struct nv40_vertex_program *vertprog;
+	struct nv40_fragment_program *fragprog;
+	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nv40_rasterizer_state *rasterizer;
+	struct nv40_zsa_state *zsa;
+	struct nv40_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct pipe_buffer *idxbuf;
+	unsigned idxbuf_format;
+	struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv40_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+	unsigned vtxelt_nr;
+	const unsigned *edgeflags;
+};
+
+static INLINE struct nv40_context *
+nv40_context(struct pipe_context *pipe)
+{
+	return (struct nv40_context *)pipe;
+}
+
+struct nv40_state_entry {
+	boolean (*validate)(struct nv40_context *nv40);
+	struct {
+		unsigned pipe;
+		unsigned hw;
+	} dirty;
+};
+
+extern void nv40_init_state_functions(struct nv40_context *nv40);
+extern void nv40_init_surface_functions(struct nv40_context *nv40);
+extern void nv40_init_query_functions(struct nv40_context *nv40);
+
+extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv40_draw.c */
+extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
+extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+					struct pipe_buffer *idxbuf,
+					unsigned ib_size, unsigned mode,
+					unsigned start, unsigned count);
+
+/* nv40_vertprog.c */
+extern void nv40_vertprog_destroy(struct nv40_context *,
+				  struct nv40_vertex_program *);
+
+/* nv40_fragprog.c */
+extern void nv40_fragprog_destroy(struct nv40_context *,
+				  struct nv40_fragment_program *);
+
+/* nv40_fragtex.c */
+extern void nv40_fragtex_bind(struct nv40_context *);
+
+/* nv40_state.c and friends */
+extern boolean nv40_state_validate(struct nv40_context *nv40);
+extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40);
+extern void nv40_state_emit(struct nv40_context *nv40);
+extern struct nv40_state_entry nv40_state_rasterizer;
+extern struct nv40_state_entry nv40_state_scissor;
+extern struct nv40_state_entry nv40_state_stipple;
+extern struct nv40_state_entry nv40_state_fragprog;
+extern struct nv40_state_entry nv40_state_vertprog;
+extern struct nv40_state_entry nv40_state_blend;
+extern struct nv40_state_entry nv40_state_blend_colour;
+extern struct nv40_state_entry nv40_state_zsa;
+extern struct nv40_state_entry nv40_state_viewport;
+extern struct nv40_state_entry nv40_state_framebuffer;
+extern struct nv40_state_entry nv40_state_fragtex;
+extern struct nv40_state_entry nv40_state_vbo;
+extern struct nv40_state_entry nv40_state_vtxfmt;
+
+/* nv40_vbo.c */
+extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv40_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nv40_clear.c */
+extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
new file mode 100644
index 00000000000..8e56cdc2fe0
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -0,0 +1,349 @@
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_pack_color.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+
+#include "nv40_context.h"
+#define NV40_SHADER_NO_FUCKEDNESS
+#include "nv40_shader.h"
+
+/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
+
+struct nv40_render_stage {
+	struct draw_stage stage;
+	struct nv40_context *nv40;
+	unsigned prim;
+};
+
+static INLINE struct nv40_render_stage *
+nv40_render_stage(struct draw_stage *stage)
+{
+	return (struct nv40_render_stage *)stage;
+}
+
+static INLINE void
+nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
+{
+	unsigned i;
+
+	for (i = 0; i < nv40->swtnl.nr_attribs; i++) {
+		unsigned idx = nv40->swtnl.draw[i];
+		unsigned hw = nv40->swtnl.hw[i];
+
+		switch (nv40->swtnl.emit[i]) {
+		case EMIT_OMIT:
+			break;
+		case EMIT_1F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (fui(v->data[idx][0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			OUT_RING  (fui(v->data[idx][2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			OUT_RING  (fui(v->data[idx][2]));
+			OUT_RING  (fui(v->data[idx][3]));
+			break;
+		case EMIT_4UB:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (pack_ub4(float_to_ubyte(v->data[idx][0]),
+					    float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][2]),
+					    float_to_ubyte(v->data[idx][3])));
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+}
+
+static INLINE void
+nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{
+	struct nv40_render_stage *rs = nv40_render_stage(stage);
+	struct nv40_context *nv40 = rs->nv40;
+	struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf;
+	unsigned i;
+
+	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
+	if (pb->remaining < ((count * 20) + 6)) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING(NULL);
+		nv40_state_emit(nv40);
+	}
+
+	/* Switch primitive modes if necessary */
+	if (rs->prim != mode) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+			OUT_RING  (NV40TCL_BEGIN_END_STOP);	
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (mode);
+		rs->prim = mode;
+	}
+
+	/* Emit vertex data */
+	for (i = 0; i < count; i++)
+		nv40_render_vertex(nv40, prim->v[i]);
+
+	/* If it's likely we'll need to empty the push buffer soon, finish
+	 * off the primitive now.
+	 */
+	if (pb->remaining < ((count * 20) + 6)) {
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		rs->prim = NV40TCL_BEGIN_END_STOP;
+	}
+}
+
+static void
+nv40_render_point(struct draw_stage *draw, struct prim_header *prim)
+{
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1);
+}
+
+static void
+nv40_render_line(struct draw_stage *draw, struct prim_header *prim)
+{
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2);
+}
+
+static void
+nv40_render_tri(struct draw_stage *draw, struct prim_header *prim)
+{
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3);
+}
+
+static void
+nv40_render_flush(struct draw_stage *draw, unsigned flags)
+{
+	struct nv40_render_stage *rs = nv40_render_stage(draw);
+	struct nv40_context *nv40 = rs->nv40;
+
+	if (rs->prim != NV40TCL_BEGIN_END_STOP) {
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		rs->prim = NV40TCL_BEGIN_END_STOP;
+	}
+}
+
+static void
+nv40_render_reset_stipple_counter(struct draw_stage *draw)
+{
+}
+
+static void
+nv40_render_destroy(struct draw_stage *draw)
+{
+	FREE(draw);
+}
+
+static INLINE void
+emit_mov(struct nv40_vertex_program *vp,
+	 unsigned dst, unsigned src, unsigned vor, unsigned mask)
+{
+	struct nv40_vertex_program_exec *inst;
+
+	vp->insns = realloc(vp->insns,
+			    sizeof(struct nv40_vertex_program_exec) *
+			    ++vp->nr_insns);
+	inst = &vp->insns[vp->nr_insns - 1];
+
+	inst->data[0] = 0x401f9c6c;
+	inst->data[1] = 0x0040000d | (src << 8);
+	inst->data[2] = 0x8106c083;
+	inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13);
+	inst->const_index = -1;
+	inst->has_branch_offset = FALSE;
+
+	vp->ir |= (1 << src);
+	if (vor != ~0)
+		vp->or |= (1 << vor);
+}
+
+static struct nv40_vertex_program *
+create_drawvp(struct nv40_context *nv40)
+{
+	struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program);
+	unsigned i;
+
+	emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8);
+	for (i = 0; i < 8; i++)
+		emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf);
+
+	vp->insns[vp->nr_insns - 1].data[3] |= 1;
+	vp->translated = TRUE;
+	return vp;
+}
+
+struct draw_stage *
+nv40_draw_render_stage(struct nv40_context *nv40)
+{
+	struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage);
+
+	if (!nv40->swtnl.vertprog)
+		nv40->swtnl.vertprog = create_drawvp(nv40);
+
+	render->nv40 = nv40;
+	render->stage.draw = nv40->draw;
+	render->stage.point = nv40_render_point;
+	render->stage.line = nv40_render_line;
+	render->stage.tri = nv40_render_tri;
+	render->stage.flush = nv40_render_flush;
+	render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter;
+	render->stage.destroy = nv40_render_destroy;
+
+	return &render->stage;
+}
+
+boolean
+nv40_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
+			 unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	unsigned i;
+	void *map;
+
+	if (!nv40_state_validate_swtnl(nv40))
+		return FALSE;
+	nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF);
+	nv40_state_emit(nv40);
+
+	for (i = 0; i < nv40->vtxbuf_nr; i++) {
+		map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer,
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_vertex_buffer(nv40->draw, i, map);
+	}
+
+	if (idxbuf) {
+		map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map);
+	} else {
+		draw_set_mapped_element_buffer(nv40->draw, 0, NULL);
+	}
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX]) {
+		const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX];
+
+		map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX],
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_constant_buffer(nv40->draw, map, nr);
+	}
+
+	draw_arrays(nv40->draw, mode, start, count);
+
+	for (i = 0; i < nv40->vtxbuf_nr; i++)
+		ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer);
+
+	if (idxbuf)
+		ws->buffer_unmap(ws, idxbuf);
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX])
+		ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]);
+
+	draw_flush(nv40->draw);
+	pipe->flush(pipe, 0, NULL);
+
+	return TRUE;
+}
+
+static INLINE void
+emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index);
+	unsigned a = nv40->swtnl.nr_attribs++;
+
+	nv40->swtnl.hw[a] = hw;
+	nv40->swtnl.emit[a] = emit;
+	nv40->swtnl.draw[a] = draw_out;
+}
+
+static boolean
+nv40_state_vtxfmt_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	unsigned colour = 0, texcoords = 0, fog = 0, i;
+
+	/* Determine needed fragprog inputs */
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		switch (fp->info.input_semantic_name[i]) {
+		case TGSI_SEMANTIC_POSITION:
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			colour |= (1 << fp->info.input_semantic_index[i]);
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+			texcoords |= (1 << fp->info.input_semantic_index[i]);
+			break;
+		case TGSI_SEMANTIC_FOG:
+			fog = 1;
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	nv40->swtnl.nr_attribs = 0;
+
+	/* Map draw vtxprog output to hw attribute IDs */
+	for (i = 0; i < 2; i++) {
+		if (!(colour & (1 << i)))
+			continue;
+		emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i);
+	}
+
+	for (i = 0; i < 8; i++) {
+		if (!(texcoords & (1 << i)))
+			continue;
+		emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i);
+	}
+
+	if (fog) {
+		emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+	}
+
+	emit_attrib(nv40, 0, EMIT_3F, TGSI_SEMANTIC_POSITION, 0);
+
+	return FALSE;
+}
+
+struct nv40_state_entry nv40_state_vtxfmt = {
+	.validate = nv40_state_vtxfmt_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
new file mode 100644
index 00000000000..91dcbebda0d
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -0,0 +1,991 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv40_context.h"
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 1
+#define MASK_Y 2
+#define MASK_Z 4
+#define MASK_W 8
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X
+#define DEF_CTEST NV40_FP_OP_COND_TR
+#include "nv40_shader.h"
+
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))
+#define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v)
+
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+struct nv40_fpc {
+	struct nv40_fragment_program *fp;
+
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	unsigned r_temps;
+	unsigned r_temps_discard;
+	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nv40_sreg *r_temp;
+
+	int num_regs;
+
+	unsigned inst_offset;
+	unsigned have_const;
+
+	struct {
+		int pipe;
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	struct nv40_sreg imm[MAX_IMM];
+	unsigned nr_imm;
+};
+
+static INLINE struct nv40_sreg
+temp(struct nv40_fpc *fpc)
+{
+	int idx = ffs(~fpc->r_temps) - 1;
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nv40_sr(NV40SR_TEMP, 0);
+	}
+
+	fpc->r_temps |= (1 << idx);
+	fpc->r_temps_discard |= (1 << idx);
+	return nv40_sr(NV40SR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nv40_fpc *fpc)
+{
+	fpc->r_temps &= ~fpc->r_temps_discard;
+	fpc->r_temps_discard = 0;
+}
+
+static INLINE struct nv40_sreg
+constant(struct nv40_fpc *fpc, int pipe, float vals[4])
+{
+	int idx;
+
+	if (fpc->nr_consts == MAX_CONSTS)
+		assert(0);
+	idx = fpc->nr_consts++;
+
+	fpc->consts[idx].pipe = pipe;
+	if (pipe == -1)
+		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
+	return nv40_sr(NV40SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+static void
+grow_insns(struct nv40_fpc *fpc, int size)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+
+	fp->insn_len += size;
+	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
+}
+
+static void
+emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV40SR_INPUT:
+		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT);
+		break;
+	case NV40SR_OUTPUT:
+		sr |= NV40_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NV40SR_TEMP:
+		sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NV40_FP_REG_SRC_SHIFT);
+		break;
+	case NV40SR_CONST:
+		if (!fpc->have_const) {
+			grow_insns(fpc, 4);
+			fpc->have_const = 1;
+		}
+
+		hw = &fp->insn[fpc->inst_offset];
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nv40_fragment_program_data *fpd;
+
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);	
+		break;
+	case NV40SR_NONE:
+		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV40_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos));
+
+	sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+static void
+emit_dst(struct nv40_fpc *fpc, struct nv40_sreg dst)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+
+	switch (dst.type) {
+	case NV40SR_TEMP:
+		if (fpc->num_regs < (dst.index + 1))
+			fpc->num_regs = dst.index + 1;
+		break;
+	case NV40SR_OUTPUT:
+		if (dst.index == 1) {
+			fp->fp_control |= 0xe;
+		} else {
+			hw[0] |= NV40_FP_OP_OUT_REG_HALF;
+		}
+		break;
+	case NV40SR_NONE:
+		hw[0] |= (1 << 30);
+		break;
+	default:
+		assert(0);
+	}
+
+	hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT);
+}
+
+static void
+nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;
+	fpc->have_const = 0;
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NV40_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV40TCL_FP_CONTROL_KIL;
+	hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NV40_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT));
+
+	emit_dst(fpc, dst);
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+static void
+nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit,
+	    struct nv40_sreg dst, int mask,
+	    struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{
+	struct nv40_fragment_program *fp = fpc->fp;
+
+	nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT);
+	fp->samplers |= (1 << unit);
+}
+
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	struct nv40_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv40_sr(NV40SR_INPUT,
+			      fpc->attrib_map[fsrc->SrcRegister.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->SrcRegister.Index < fpc->nr_imm);
+		src = fpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = fpc->r_temp[fsrc->SrcRegister.Index];
+		break;
+	/* NV40 fragprog result regs are just temps, so this is simple */
+	case TGSI_FILE_OUTPUT:
+		src = fpc->r_result[fsrc->SrcRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		return fpc->r_result[fdst->DstRegister.Index];
+	case TGSI_FILE_TEMPORARY:
+		return fpc->r_temp[fdst->DstRegister.Index];
+	case TGSI_FILE_NULL:
+		return nv40_sr(NV40SR_NONE, 0);
+	default:
+		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+		return nv40_sr(NV40SR_NONE, 0);
+	}
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask = 0;
+
+	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
+	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
+	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
+	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
+	return mask;
+}
+
+static boolean
+src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv40_sreg *src)
+{
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg tgsi = tgsi_src(fpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+
+	for (c = 0; c < 4; c++) {
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= (1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+
+		if (!tgsi.negate && neg[c])
+			neg_mask |= (1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)
+		return TRUE;
+
+	*src = temp(fpc);
+
+	if (mask)
+		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
+
+	if (zero_mask)
+		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
+
+	if (one_mask)
+		arith(fpc, 0, STR, *src, one_mask, *src, none, none);
+
+	if (neg_mask) {
+		struct nv40_sreg one = temp(fpc);
+		arith(fpc, 0, STR, one, neg_mask, one, none, none);
+		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg src[3], dst, tmp;
+	int mask, sat, unit;
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(fpc, fsrc, &src[i]))
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->SrcRegister.Index) {
+				ii = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->SrcRegister.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		tmp = temp(fpc);
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV40_VP_INST_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DDX:
+		if (mask & (MASK_Z | MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		if (mask & (MASK_Z | MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		dst = nv40_sr(NV40SR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		tmp = temp(fpc);
+		arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_NOISE1:
+	case TGSI_OPCODE_NOISE2:
+	case TGSI_OPCODE_NOISE3:
+	case TGSI_OPCODE_NOISE4:
+		arith(fpc, sat, SFL, dst, mask, none, none, none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, tmp, MASK_X,
+		      swz(src[0], X, X, X, X), none, none);
+		arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(fpc, sat, EX2, dst, mask,
+		      swz(tmp, X, X, X, X), none, none);
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none);
+		arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none);
+		arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z,
+		      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+		arith(fpc, sat, MAD, dst, mask,
+		      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		break;
+	case TGSI_OPCODE_RSQ:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X,
+		      abs(swz(src[0], X, X, X, X)), none, none);
+		arith(fpc, sat, EX2, dst, mask,
+		      neg(swz(tmp, X, X, X, X)), none, none);
+		break;
+	case TGSI_OPCODE_SCS:
+		if (mask & MASK_X) {
+			arith(fpc, sat, COS, dst, MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		if (mask & MASK_Y) {
+			arith(fpc, sat, SIN, dst, MASK_Y,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(fpc);
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV40_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_FP_OP_INPUT_SRC_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_FP_OP_INPUT_SRC_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV40_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic.
+						     SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	fpc->attrib_map[fdec->DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	unsigned idx = fdec->DeclarationRange.First;
+	unsigned hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = 1;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		switch (fdec->Semantic.SemanticIndex) {
+		case 0: hw = 0; break;
+		case 1: hw = 2; break;
+		case 2: hw = 3; break;
+		case 3: hw = 4; break;
+		default:
+			NOUVEAU_ERR("bad rcol index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	fpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
+	fpc->r_temps |= (1 << hw);
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_prepare(struct nv40_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, i;
+
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nv40_fragprog_parse_decl_attrib(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_fragprog_parse_decl_output(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp) {
+					high_temp =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4];
+			
+			imm = &p.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(fpc->nr_imm < MAX_IMM);
+
+			vals[0] = imm->u.ImmediateFloat32[0].Float;
+			vals[1] = imm->u.ImmediateFloat32[1].Float;
+			vals[2] = imm->u.ImmediateFloat32[2].Float;
+			vals[3] = imm->u.ImmediateFloat32[3].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		fpc->r_temps_discard = 0;
+	}
+
+	return TRUE;
+
+out_err:
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+static void
+nv40_fragprog_translate(struct nv40_context *nv40,
+			struct nv40_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_fpc *fpc = NULL;
+
+	fpc = CALLOC(1, sizeof(struct nv40_fpc));
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->num_regs = 2;
+
+	if (!nv40_fragprog_prepare(fpc)) {
+		FREE(fpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_fragprog_parse_instruction(fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+
+	/* Terminate final instruction */
+	fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+	
+	fp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	FREE(fpc);
+}
+
+static void
+nv40_fragprog_upload(struct nv40_context *nv40,
+		     struct nv40_fragment_program *fp)
+{
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	const uint32_t le = 1;
+	uint32_t *map;
+	int i;
+
+	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+#if 0
+	for (i = 0; i < fp->insn_len; i++) {
+		fflush(stdout); fflush(stderr);
+		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
+		fflush(stdout); fflush(stderr);
+	}
+#endif
+
+	if ((*(const uint8_t *)&le)) {
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = fp->insn[i];
+		}
+	} else {
+		/* Weird swapping for big-endian chips */
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = ((fp->insn[i] & 0xffff) << 16) |
+				  ((fp->insn[i] >> 16) & 0xffff);
+		}
+	}
+
+	ws->buffer_unmap(ws, fp->buffer);
+}
+
+static boolean
+nv40_fragprog_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	struct pipe_buffer *constbuf =
+		nv40->constbuf[PIPE_SHADER_FRAGMENT];
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_stateobj *so;
+	boolean new_consts = FALSE;
+	int i;
+
+	if (fp->translated)
+		goto update_constants;
+
+	nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG;
+	nv40_fragprog_translate(nv40, fp);
+	if (!fp->translated) {
+		nv40->fallback_swrast |= NV40_NEW_FRAGPROG;
+		return FALSE;
+	}
+
+	fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4);
+	nv40_fragprog_upload(nv40, fp);
+
+	so = so_new(4, 1);
+	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
+	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1);
+	so_data  (so, fp->fp_control);
+	so_ref(so, &fp->so);
+
+update_constants:
+	if (fp->nr_consts) {
+		float *map;
+		
+		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		for (i = 0; i < fp->nr_consts; i++) {
+			struct nv40_fragment_program_data *fpd = &fp->consts[i];
+			uint32_t *p = &fp->insn[fpd->offset];
+			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
+
+			if (!memcmp(p, cb, 4 * sizeof(float)))
+				continue;
+			memcpy(p, cb, 4 * sizeof(float));
+			new_consts = TRUE;
+		}
+		ws->buffer_unmap(ws, constbuf);
+
+		if (new_consts)
+			nv40_fragprog_upload(nv40, fp);
+	}
+
+	if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) {
+		so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv40_fragprog_destroy(struct nv40_context *nv40,
+		      struct nv40_fragment_program *fp)
+{
+	if (fp->insn_len)
+		FREE(fp->insn);
+}
+
+struct nv40_state_entry nv40_state_fragprog = {
+	.validate = nv40_fragprog_validate,
+	.dirty = {
+		.pipe = NV40_NEW_FRAGPROG,
+		.hw = NV40_STATE_FRAGPROG
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
new file mode 100644
index 00000000000..566d5a8d5ba
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -0,0 +1,169 @@
+#include "nv40_context.h"
+
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x | NV40TCL_TEX_SWIZZLE_S0_Y_##ts0y |         \
+   NV40TCL_TEX_SWIZZLE_S0_Z_##ts0z | NV40TCL_TEX_SWIZZLE_S0_W_##ts0w |         \
+   NV40TCL_TEX_SWIZZLE_S1_X_##ts1x | NV40TCL_TEX_SWIZZLE_S1_Y_##ts1y |         \
+   NV40TCL_TEX_SWIZZLE_S1_Z_##ts1z | NV40TCL_TEX_SWIZZLE_S1_W_##ts1w),         \
+  ((NV40TCL_TEX_FILTER_SIGNED_RED*sx) | (NV40TCL_TEX_FILTER_SIGNED_GREEN*sy) |       \
+   (NV40TCL_TEX_FILTER_SIGNED_BLUE*sz) | (NV40TCL_TEX_FILTER_SIGNED_ALPHA*sw))       \
+}
+
+struct nv40_texture_format {
+	boolean defined;
+	uint	pipe;
+	int     format;
+	int     swizzle;
+	int     sign;
+};
+
+static struct nv40_texture_format
+nv40_texture_formats[] = {
+	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
+	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	{},
+};
+
+static struct nv40_texture_format *
+nv40_fragtex_format(uint pipe_format)
+{
+	struct nv40_texture_format *tf = nv40_texture_formats;
+
+	while (tf->defined) {
+		if (tf->pipe == pipe_format)
+			return tf;
+		tf++;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format));
+	return NULL;
+}
+
+
+static struct nouveau_stateobj *
+nv40_fragtex_build(struct nv40_context *nv40, int unit)
+{
+	struct nv40_sampler_state *ps = nv40->tex_sampler[unit];
+	struct nv40_miptree *nv40mt = nv40->tex_miptree[unit];
+	struct pipe_texture *pt = &nv40mt->base;
+	struct nv40_texture_format *tf;
+	struct nouveau_stateobj *so;
+	uint32_t txf, txs, txp;
+	int swizzled = 0; /*XXX: implement in region code? */
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv40_fragtex_format(pt->format);
+	if (!tf)
+		assert(0);
+
+	txf  = ps->fmt;
+	txf |= tf->format | 0x8000;
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+
+	if (1) /* XXX */
+		txf |= NV40TCL_TEX_FORMAT_NO_BORDER;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV40TCL_TEX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return NULL;
+	}
+
+	if (swizzled) {
+		txp = 0;
+	} else {
+		txp  = nv40mt->level[0].pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	txs = tf->swizzle;
+
+	so = so_new(16, 2);
+	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
+	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
+		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | tf->sign | 0x2000 /*voodoo*/);
+	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
+		       pt->height[0]);
+	so_data  (so, ps->bcol);
+	so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1);
+	so_data  (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	return so;
+}
+
+static boolean
+nv40_fragtex_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	struct nv40_state *state = &nv40->state;
+	struct nouveau_stateobj *so;
+	unsigned samplers, unit;
+
+	samplers = state->fp_samplers & ~fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		so = so_new(2, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1);
+		so_data  (so, 0);
+		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit));
+	}
+
+	samplers = nv40->dirty_samplers & fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		so = nv40_fragtex_build(nv40, unit);
+		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit));
+	}
+
+	nv40->state.fp_samplers = fp->samplers;
+	return FALSE;
+}
+
+struct nv40_state_entry nv40_state_fragtex = {
+	.validate = nv40_fragtex_validate,
+	.dirty = {
+		.pipe = NV40_NEW_SAMPLER | NV40_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c
new file mode 100644
index 00000000000..f321b721492
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -0,0 +1,164 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv40_context.h"
+
+static void
+nv40_miptree_layout(struct nv40_miptree *mt)
+{
+	struct pipe_texture *pt = &mt->base;
+	boolean swizzled = FALSE;
+	uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+	uint offset = 0;
+	int nr_faces, l, f, pitch;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth[0];
+	} else {
+		nr_faces = 1;
+	}
+
+	pitch = pt->width[0];
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->depth[l] = depth;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+
+		if (swizzled)
+			pitch = pt->nblocksx[l];
+		pitch = align(pitch, 64);
+
+		mt->level[l].pitch = pitch * pt->block.size;
+		mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+		depth  = MAX2(1, depth  >> 1);
+	}
+
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			mt->level[l].image_offset[f] = offset;
+			offset += mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv40_miptree *mt;
+
+	mt = MALLOC(sizeof(struct nv40_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = pscreen;
+
+	nv40_miptree_layout(mt);
+
+	mt->buffer = ws->buffer_create(ws, 256,
+				       PIPE_BUFFER_USAGE_PIXEL |
+				       NOUVEAU_BUFFER_USAGE_TEXTURE,
+				       mt->total_size);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+
+	return &mt->base;
+}
+
+static void
+nv40_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt)
+{
+	struct pipe_texture *pt = *ppt;
+	struct nv40_miptree *mt = (struct nv40_miptree *)pt;
+	int l;
+
+	*ppt = NULL;
+	if (--pt->refcount)
+		return;
+
+
+	pipe_buffer_reference(pscreen, &mt->buffer, NULL);
+	for (l = 0; l <= pt->last_level; l++) {
+		if (mt->level[l].image_offset)
+			FREE(mt->level[l].image_offset);
+	}
+
+	FREE(mt);
+}
+
+static struct pipe_surface *
+nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv40_miptree *mt = (struct nv40_miptree *)pt;
+	struct pipe_surface *ps;
+
+	ps = CALLOC_STRUCT(pipe_surface);
+	if (!ps)
+		return NULL;
+	pipe_texture_reference(&ps->texture, pt);
+	pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer);
+	ps->format = pt->format;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = mt->level[level].pitch;
+	ps->usage = flags;
+	ps->status = PIPE_SURFACE_STATUS_DEFINED;
+	ps->refcount = 1;
+	ps->winsys = pscreen->winsys;
+
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		ps->offset = mt->level[level].image_offset[face];
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		ps->offset = mt->level[level].image_offset[zslice];
+	} else {
+		ps->offset = mt->level[level].image_offset[0];
+	}
+
+	return ps;
+}
+
+static void
+nv40_miptree_surface_del(struct pipe_screen *pscreen,
+			 struct pipe_surface **psurface)
+{
+	struct pipe_surface *ps = *psurface;
+
+	*psurface = NULL;
+	if (--ps->refcount > 0)
+		return;
+
+	pipe_texture_reference(&ps->texture, NULL);
+	pipe_buffer_reference(pscreen, &ps->buffer, NULL);
+	FREE(ps);
+}
+
+void
+nv40_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv40_miptree_create;
+	pscreen->texture_release = nv40_miptree_release;
+	pscreen->get_tex_surface = nv40_miptree_surface_new;
+	pscreen->tex_surface_release = nv40_miptree_surface_del;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
new file mode 100644
index 00000000000..57f39cfab0c
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -0,0 +1,122 @@
+#include "pipe/p_context.h"
+
+#include "nv40_context.h"
+
+struct nv40_query {
+	struct nouveau_resource *object;
+	unsigned type;
+	boolean ready;
+	uint64_t result;
+};
+
+static INLINE struct nv40_query *
+nv40_query(struct pipe_query *pipe)
+{
+	return (struct nv40_query *)pipe;
+}
+
+static struct pipe_query *
+nv40_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nv40_query *q;
+
+	q = CALLOC(1, sizeof(struct nv40_query));
+	q->type = query_type;
+
+	return (struct pipe_query *)q;
+}
+
+static void
+nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+
+	if (q->object)
+		nv40->nvws->res_free(&q->object);
+	FREE(q);
+}
+
+static void
+nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object) {
+		uint64 tmp;
+		pipe->get_query_result(pipe, pq, 1, &tmp);
+	}
+
+	if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object))
+		assert(0);
+	nv40->nvws->notifier_reset(nv40->screen->query, q->object->start);
+
+	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
+	OUT_RING  (1);
+	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (1);
+
+	q->ready = FALSE;
+}
+
+static void
+nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+
+	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
+	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING(NULL);
+}
+
+static boolean
+nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64 *result)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+	struct nouveau_winsys *nvws = nv40->nvws;
+
+	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	if (!q->ready) {
+		unsigned status;
+
+		status = nvws->notifier_status(nv40->screen->query,
+					       q->object->start);
+		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+			if (wait == FALSE)
+				return FALSE;
+			nvws->notifier_wait(nv40->screen->query, q->object->start,
+					    NV_NOTIFY_STATE_STATUS_COMPLETED,
+					    0);
+		}
+
+		q->result = nvws->notifier_retval(nv40->screen->query,
+						  q->object->start);
+		q->ready = TRUE;
+		nvws->res_free(&q->object);
+	}
+
+	*result = q->result;
+	return TRUE;
+}
+
+void
+nv40_init_query_functions(struct nv40_context *nv40)
+{
+	nv40->pipe.create_query = nv40_query_create;
+	nv40->pipe.destroy_query = nv40_query_destroy;
+	nv40->pipe.begin_query = nv40_query_begin;
+	nv40->pipe.end_query = nv40_query_end;
+	nv40->pipe.get_query_result = nv40_query_result;
+}
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
new file mode 100644
index 00000000000..ada0238511d
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -0,0 +1,314 @@
+#include "pipe/p_screen.h"
+
+#include "nv40_context.h"
+#include "nv40_screen.h"
+
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
+
+static const char *
+nv40_screen_get_name(struct pipe_screen *pscreen)
+{
+	struct nv40_screen *screen = nv40_screen(pscreen);
+	struct nouveau_device *dev = screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv40_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+static int
+nv40_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	struct nv40_screen *screen = nv40_screen(pscreen);
+
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 16;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 1;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 1;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 1;
+	case PIPE_CAP_POINT_SPRITE:
+		return 1;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 4;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 1;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 1;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case NOUVEAU_CAP_HW_VTXBUF:
+		return 1;
+	case NOUVEAU_CAP_HW_IDXBUF:
+		if (screen->curie->grclass == NV40TCL)
+			return 1;
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv40_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 16.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 16.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static boolean
+nv40_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM: 
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM:
+		case PIPE_FORMAT_R16_SNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_A8L8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_DXT1_RGB:
+		case PIPE_FORMAT_DXT1_RGBA:
+		case PIPE_FORMAT_DXT3_RGBA:
+		case PIPE_FORMAT_DXT5_RGBA:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static void *
+nv40_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys *ws = screen->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, surface->buffer, flags);
+	if (!map)
+		return NULL;
+
+	return map + surface->offset;
+}
+
+static void
+nv40_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+
+	ws->buffer_unmap(ws, surface->buffer);
+}
+
+static void
+nv40_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv40_screen *screen = nv40_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->res_free(&screen->vp_exec_heap);
+	nvws->res_free(&screen->vp_data_heap);
+	nvws->res_free(&screen->query_heap);
+	nvws->notifier_free(&screen->query);
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->curie);
+
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen);
+	struct nouveau_stateobj *so;
+	unsigned curie_class;
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	switch (chipset & 0xf0) {
+	case 0x40:
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV40TCL;
+		else
+		if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV44TCL;
+		break;
+	case 0x60:
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV44TCL;
+		break;
+	default:
+		break;
+	}
+
+	if (!curie_class) {
+		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset);
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		return FALSE;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Query objects */
+	ret = nvws->notifier_alloc(nvws, 32, &screen->query);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	ret = nvws->res_init(&screen->query_heap, 0, 32);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Vtxprog resources */
+	if (nvws->res_init(&screen->vp_exec_heap, 0, 512) ||
+	    nvws->res_init(&screen->vp_data_heap, 0, 256)) {
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Static curie initialisation */
+	so = so_new(128, 0);
+	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
+	so_data  (so, screen->sync->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR1, 1);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2);
+	so_data  (so, 0);
+	so_data  (so, screen->query->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+
+	so_method(so, screen->curie, 0x1ea4, 3);
+	so_data  (so, 0x00000010);
+	so_data  (so, 0x01000100);
+	so_data  (so, 0xff800006);
+
+	/* vtxprog output routing */
+	so_method(so, screen->curie, 0x1fc4, 1);
+	so_data  (so, 0x06144321);
+	so_method(so, screen->curie, 0x1fc8, 2);
+	so_data  (so, 0xedcba987);
+	so_data  (so, 0x00000021);
+	so_method(so, screen->curie, 0x1fd0, 1);
+	so_data  (so, 0x00171615);
+	so_method(so, screen->curie, 0x1fd4, 1);
+	so_data  (so, 0x001b1a19);
+
+	so_method(so, screen->curie, 0x1ef8, 1);
+	so_data  (so, 0x0020ffff);
+	so_method(so, screen->curie, 0x1d64, 1);
+	so_data  (so, 0x00d30000);
+	so_method(so, screen->curie, 0x1e94, 1);
+	so_data  (so, 0x00000001);
+
+	so_emit(nvws, so);
+	so_ref(NULL, &so);
+	nvws->push_flush(nvws, 0, NULL);
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv40_screen_destroy;
+
+	screen->pipe.get_name = nv40_screen_get_name;
+	screen->pipe.get_vendor = nv40_screen_get_vendor;
+	screen->pipe.get_param = nv40_screen_get_param;
+	screen->pipe.get_paramf = nv40_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv40_screen_surface_format_supported;
+
+	screen->pipe.surface_map = nv40_surface_map;
+	screen->pipe.surface_unmap = nv40_surface_unmap;
+
+	nv40_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h
new file mode 100644
index 00000000000..c04a1275a00
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_screen.h
@@ -0,0 +1,35 @@
+#ifndef __NV40_SCREEN_H__
+#define __NV40_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv40_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+
+	unsigned cur_pctx;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *curie;
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;
+	struct nouveau_resource *vp_data_heap;
+
+	/* Current 3D state of channel */
+	struct nouveau_stateobj *state[NV40_STATE_MAX];
+};
+
+static INLINE struct nv40_screen *
+nv40_screen(struct pipe_screen *screen)
+{
+	return (struct nv40_screen *)screen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h
new file mode 100644
index 00000000000..854dccf5486
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_shader.h
@@ -0,0 +1,556 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK is not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40, it's use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#    define NV40_VP_INST_COND_FL                                               0
+#    define NV40_VP_INST_COND_LT                                               1
+#    define NV40_VP_INST_COND_EQ                                               2
+#    define NV40_VP_INST_COND_LE                                               3
+#    define NV40_VP_INST_COND_GT                                               4
+#    define NV40_VP_INST_COND_NE                                               5
+#    define NV40_VP_INST_COND_GE                                               6
+#    define NV40_VP_INST_COND_TR                                               7
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_MUL                                             0x02
+#    define NV40_VP_INST_OP_ADD                                             0x03
+#    define NV40_VP_INST_OP_MAD                                             0x04
+#    define NV40_VP_INST_OP_DP3                                             0x05
+#    define NV40_VP_INST_OP_DPH                                             0x06
+#    define NV40_VP_INST_OP_DP4                                             0x07
+#    define NV40_VP_INST_OP_DST                                             0x08
+#    define NV40_VP_INST_OP_MIN                                             0x09
+#    define NV40_VP_INST_OP_MAX                                             0x0A
+#    define NV40_VP_INST_OP_SLT                                             0x0B
+#    define NV40_VP_INST_OP_SGE                                             0x0C
+#    define NV40_VP_INST_OP_ARL                                             0x0D
+#    define NV40_VP_INST_OP_FRC                                             0x0E
+#    define NV40_VP_INST_OP_FLR                                             0x0F
+#    define NV40_VP_INST_OP_SEQ                                             0x10
+#    define NV40_VP_INST_OP_SFL                                             0x11
+#    define NV40_VP_INST_OP_SGT                                             0x12
+#    define NV40_VP_INST_OP_SLE                                             0x13
+#    define NV40_VP_INST_OP_SNE                                             0x14
+#    define NV40_VP_INST_OP_STR                                             0x15
+#    define NV40_VP_INST_OP_SSG                                             0x16
+#    define NV40_VP_INST_OP_ARR                                             0x17
+#    define NV40_VP_INST_OP_ARA                                             0x18
+#    define NV40_VP_INST_OP_TXL                                             0x19
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_RCP                                             0x02
+#    define NV40_VP_INST_OP_RCC                                             0x03
+#    define NV40_VP_INST_OP_RSQ                                             0x04
+#    define NV40_VP_INST_OP_EXP                                             0x05
+#    define NV40_VP_INST_OP_LOG                                             0x06
+#    define NV40_VP_INST_OP_LIT                                             0x07
+#    define NV40_VP_INST_OP_BRA                                             0x09
+#    define NV40_VP_INST_OP_CAL                                             0x0B
+#    define NV40_VP_INST_OP_RET                                             0x0C
+#    define NV40_VP_INST_OP_LG2                                             0x0D
+#    define NV40_VP_INST_OP_EX2                                             0x0E
+#    define NV40_VP_INST_OP_SIN                                             0x0F
+#    define NV40_VP_INST_OP_COS                                             0x10
+#    define NV40_VP_INST_OP_PUSHA                                           0x13
+#    define NV40_VP_INST_OP_POPA                                            0x14
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#    define NV40_VP_INST_IN_POS                                                0
+#    define NV40_VP_INST_IN_WEIGHT                                             1
+#    define NV40_VP_INST_IN_NORMAL                                             2
+#    define NV40_VP_INST_IN_COL0                                               3
+#    define NV40_VP_INST_IN_COL1                                               4
+#    define NV40_VP_INST_IN_FOGC                                               5
+#    define NV40_VP_INST_IN_TC0                                                8
+#    define NV40_VP_INST_IN_TC(n)                                          (8+n)
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+#    define NV40_VP_INST_DEST_TC(n)                                        (7+n)
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST_LAST                                               (1 << 0)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *         0 - Opcode, output reg/mask, ATTRIB source
+ *         1 - Source 0
+ *         2 - Source 1
+ *         3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *                 result.color == R0.xyzw
+ *                 result.depth == R1.z
+ * When the fragprog contains instructions to write depth,
+ * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *                 ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and
+ * SWIZZLE_ONE.
+ *
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as
+ * SWIZZLE_ZERO is implemented simply by not writing to the relevant components
+ * of the destination.
+ *
+ * Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *         LIT
+ *         LRP - MAD+MAD
+ *         SUB - ADD, negate second source
+ *         RSQ - LG2 + EX2
+ *         POW - LG2 + MUL + EX2
+ *         SCS - COS + SIN
+ *         XPD
+ *         DP2 - MUL + ADD
+ *         NRM
+ */
+
+//== Opcode / Destination selection ==
+#define NV40_FP_OP_PROGRAM_END                                          (1 << 0)
+#define NV40_FP_OP_OUT_REG_SHIFT                                               1
+#define NV40_FP_OP_OUT_REG_MASK                                        (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV40_FP_OP_OUT_REG_HALF                                         (1 << 7)
+#define NV40_FP_OP_COND_WRITE_ENABLE                                    (1 << 8)
+#define NV40_FP_OP_OUTMASK_SHIFT                                               9
+#define NV40_FP_OP_OUTMASK_MASK                                       (0xF << 9)
+#    define NV40_FP_OP_OUT_X                                            (1 << 9)
+#    define NV40_FP_OP_OUT_Y                                            (1 <<10)
+#    define NV40_FP_OP_OUT_Z                                            (1 <<11)
+#    define NV40_FP_OP_OUT_W                                            (1 <<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV40_FP_OP_INPUT_SRC_SHIFT                                            13
+#define NV40_FP_OP_INPUT_SRC_MASK                                     (15 << 13)
+#    define NV40_FP_OP_INPUT_SRC_POSITION                                    0x0
+#    define NV40_FP_OP_INPUT_SRC_COL0                                        0x1
+#    define NV40_FP_OP_INPUT_SRC_COL1                                        0x2
+#    define NV40_FP_OP_INPUT_SRC_FOGC                                        0x3
+#    define NV40_FP_OP_INPUT_SRC_TC0                                         0x4
+#    define NV40_FP_OP_INPUT_SRC_TC(n)                                 (0x4 + n)
+#    define NV40_FP_OP_INPUT_SRC_FACING                                      0xE
+#define NV40_FP_OP_TEX_UNIT_SHIFT                                             17
+#define NV40_FP_OP_TEX_UNIT_MASK                                     (0xF << 17)
+#define NV40_FP_OP_PRECISION_SHIFT                                            22
+#define NV40_FP_OP_PRECISION_MASK                                      (3 << 22)
+#   define NV40_FP_PRECISION_FP32                                              0
+#   define NV40_FP_PRECISION_FP16                                              1
+#   define NV40_FP_PRECISION_FX12                                              2
+#define NV40_FP_OP_OPCODE_SHIFT                                               24
+#define NV40_FP_OP_OPCODE_MASK                                      (0x3F << 24)
+#        define NV40_FP_OP_OPCODE_NOP                                       0x00
+#        define NV40_FP_OP_OPCODE_MOV                                       0x01
+#        define NV40_FP_OP_OPCODE_MUL                                       0x02
+#        define NV40_FP_OP_OPCODE_ADD                                       0x03
+#        define NV40_FP_OP_OPCODE_MAD                                       0x04
+#        define NV40_FP_OP_OPCODE_DP3                                       0x05
+#        define NV40_FP_OP_OPCODE_DP4                                       0x06
+#        define NV40_FP_OP_OPCODE_DST                                       0x07
+#        define NV40_FP_OP_OPCODE_MIN                                       0x08
+#        define NV40_FP_OP_OPCODE_MAX                                       0x09
+#        define NV40_FP_OP_OPCODE_SLT                                       0x0A
+#        define NV40_FP_OP_OPCODE_SGE                                       0x0B
+#        define NV40_FP_OP_OPCODE_SLE                                       0x0C
+#        define NV40_FP_OP_OPCODE_SGT                                       0x0D
+#        define NV40_FP_OP_OPCODE_SNE                                       0x0E
+#        define NV40_FP_OP_OPCODE_SEQ                                       0x0F
+#        define NV40_FP_OP_OPCODE_FRC                                       0x10
+#        define NV40_FP_OP_OPCODE_FLR                                       0x11
+#        define NV40_FP_OP_OPCODE_KIL                                       0x12
+#        define NV40_FP_OP_OPCODE_PK4B                                      0x13
+#        define NV40_FP_OP_OPCODE_UP4B                                      0x14
+/* DDX/DDY can only write to XY */
+#        define NV40_FP_OP_OPCODE_DDX                                       0x15
+#        define NV40_FP_OP_OPCODE_DDY                                       0x16
+#        define NV40_FP_OP_OPCODE_TEX                                       0x17
+#        define NV40_FP_OP_OPCODE_TXP                                       0x18
+#        define NV40_FP_OP_OPCODE_TXD                                       0x19
+#        define NV40_FP_OP_OPCODE_RCP                                       0x1A
+#        define NV40_FP_OP_OPCODE_EX2                                       0x1C
+#        define NV40_FP_OP_OPCODE_LG2                                       0x1D
+#        define NV40_FP_OP_OPCODE_STR                                       0x20
+#        define NV40_FP_OP_OPCODE_SFL                                       0x21
+#        define NV40_FP_OP_OPCODE_COS                                       0x22
+#        define NV40_FP_OP_OPCODE_SIN                                       0x23
+#        define NV40_FP_OP_OPCODE_PK2H                                      0x24
+#        define NV40_FP_OP_OPCODE_UP2H                                      0x25
+#        define NV40_FP_OP_OPCODE_PK4UB                                     0x27
+#        define NV40_FP_OP_OPCODE_UP4UB                                     0x28
+#        define NV40_FP_OP_OPCODE_PK2US                                     0x29
+#        define NV40_FP_OP_OPCODE_UP2US                                     0x2A
+#        define NV40_FP_OP_OPCODE_DP2A                                      0x2E
+#        define NV40_FP_OP_OPCODE_TXL                                       0x2F
+#        define NV40_FP_OP_OPCODE_TXB                                       0x31
+#        define NV40_FP_OP_OPCODE_DIV                                       0x3A
+#        define NV40_FP_OP_OPCODE_UNK_LIT                                   0x3C
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#        define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#        define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#        define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#        define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#        define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#        define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+#define NV40_FP_OP_OUT_SAT                                             (1 << 31)
+
+/* high order bits of SRC0 */
+#define NV40_FP_OP_OUT_ABS                                             (1 << 29)
+#define NV40_FP_OP_COND_SWZ_W_SHIFT                                           27
+#define NV40_FP_OP_COND_SWZ_W_MASK                                     (3 << 27)
+#define NV40_FP_OP_COND_SWZ_Z_SHIFT                                           25
+#define NV40_FP_OP_COND_SWZ_Z_MASK                                     (3 << 25)
+#define NV40_FP_OP_COND_SWZ_Y_SHIFT                                           23
+#define NV40_FP_OP_COND_SWZ_Y_MASK                                     (3 << 23)
+#define NV40_FP_OP_COND_SWZ_X_SHIFT                                           21
+#define NV40_FP_OP_COND_SWZ_X_MASK                                     (3 << 21)
+#define NV40_FP_OP_COND_SWZ_ALL_SHIFT                                         21
+#define NV40_FP_OP_COND_SWZ_ALL_MASK                                (0xFF << 21)
+#define NV40_FP_OP_COND_SHIFT                                                 18
+#define NV40_FP_OP_COND_MASK                                        (0x07 << 18)
+#        define NV40_FP_OP_COND_FL                                             0
+#        define NV40_FP_OP_COND_LT                                             1
+#        define NV40_FP_OP_COND_EQ                                             2
+#        define NV40_FP_OP_COND_LE                                             3
+#        define NV40_FP_OP_COND_GT                                             4
+#        define NV40_FP_OP_COND_NE                                             5
+#        define NV40_FP_OP_COND_GE                                             6
+#        define NV40_FP_OP_COND_TR                                             7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1<<31)
+#define NV40_FP_OP_DST_SCALE_SHIFT                                            28
+#define NV40_FP_OP_DST_SCALE_MASK                                      (3 << 28)
+#define NV40_FP_OP_DST_SCALE_1X                                                0
+#define NV40_FP_OP_DST_SCALE_2X                                                1
+#define NV40_FP_OP_DST_SCALE_4X                                                2
+#define NV40_FP_OP_DST_SCALE_8X                                                3
+#define NV40_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV40_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV40_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+// SRC2 high-order
+#define NV40_FP_OP_INDEX_INPUT                                         (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT                                           19
+#define NV40_FP_OP_ADDR_INDEX_MASK                                   (0xF << 19)
+
+//== Register selection ==
+#define NV40_FP_REG_TYPE_SHIFT                                                 0
+#define NV40_FP_REG_TYPE_MASK                                           (3 << 0)
+#        define NV40_FP_REG_TYPE_TEMP                                          0
+#        define NV40_FP_REG_TYPE_INPUT                                         1
+#        define NV40_FP_REG_TYPE_CONST                                         2
+#define NV40_FP_REG_SRC_SHIFT                                                  2
+#define NV40_FP_REG_SRC_MASK                                           (63 << 2)
+#define NV40_FP_REG_SRC_HALF                                            (1 << 8)
+#define NV40_FP_REG_SWZ_ALL_SHIFT                                              9
+#define NV40_FP_REG_SWZ_ALL_MASK                                      (255 << 9)
+#define NV40_FP_REG_SWZ_X_SHIFT                                                9
+#define NV40_FP_REG_SWZ_X_MASK                                          (3 << 9)
+#define NV40_FP_REG_SWZ_Y_SHIFT                                               11
+#define NV40_FP_REG_SWZ_Y_MASK                                         (3 << 11)
+#define NV40_FP_REG_SWZ_Z_SHIFT                                               13
+#define NV40_FP_REG_SWZ_Z_MASK                                         (3 << 13)
+#define NV40_FP_REG_SWZ_W_SHIFT                                               15
+#define NV40_FP_REG_SWZ_W_MASK                                         (3 << 15)
+#        define NV40_FP_SWIZZLE_X                                              0
+#        define NV40_FP_SWIZZLE_Y                                              1
+#        define NV40_FP_SWIZZLE_Z                                              2
+#        define NV40_FP_SWIZZLE_W                                              3
+#define NV40_FP_REG_NEGATE                                             (1 << 17)
+
+#ifndef NV40_SHADER_NO_FUCKEDNESS
+#define NV40SR_NONE	0
+#define NV40SR_OUTPUT	1
+#define NV40SR_INPUT	2
+#define NV40SR_TEMP	3
+#define NV40SR_CONST	4
+
+struct nv40_sreg {
+	int type;
+	int index;
+
+	int dst_scale;
+
+	int negate;
+	int abs;
+	int swz[4];
+
+	int cc_update;
+	int cc_update_reg;
+	int cc_test;
+	int cc_test_reg;
+	int cc_swz[4];
+};
+
+static INLINE struct nv40_sreg
+nv40_sr(int type, int index)
+{
+	struct nv40_sreg temp = {
+		.type = type,
+		.index = index,
+		.dst_scale = DEF_SCALE,
+		.abs = 0,
+		.negate = 0,
+		.swz = { 0, 1, 2, 3 },
+		.cc_update = 0,
+		.cc_update_reg = 0,
+		.cc_test = DEF_CTEST,
+		.cc_test_reg = 0,
+		.cc_swz = { 0, 1, 2, 3 },
+	};
+	return temp;
+}
+
+static INLINE struct nv40_sreg
+nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w)
+{
+	struct nv40_sreg dst = src;
+
+	dst.swz[SWZ_X] = src.swz[x];
+	dst.swz[SWZ_Y] = src.swz[y];
+	dst.swz[SWZ_Z] = src.swz[z];
+	dst.swz[SWZ_W] = src.swz[w];
+	return dst;
+}
+
+static INLINE struct nv40_sreg
+nv40_sr_neg(struct nv40_sreg src)
+{
+	src.negate = !src.negate;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+nv40_sr_abs(struct nv40_sreg src)
+{
+	src.abs = 1;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+nv40_sr_scale(struct nv40_sreg src, int scale)
+{
+	src.dst_scale = scale;
+	return src;
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
new file mode 100644
index 00000000000..255c4b294d1
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -0,0 +1,740 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+static void *
+nv40_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso));
+	struct nouveau_stateobj *so = so_new(16, 0);
+
+	if (cso->blend_enable) {
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
+		so_data  (so, 1);
+		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rgb_src_factor));
+		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rgb_dst_factor));
+		so_method(so, curie, NV40TCL_BLEND_EQUATION, 1);
+		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rgb_func));
+	} else {
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, curie, NV40TCL_COLOR_MASK, 1);
+	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
+
+	if (cso->logicop_enable) {
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, curie, NV40TCL_DITHER_ENABLE, 1);
+	so_data  (so, cso->dither ? 1 : 0);
+
+	so_ref(so, &bso->so);
+	bso->pipe = *cso;
+	return (void *)bso;
+}
+
+static void
+nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->blend = hwcso;
+	nv40->dirty |= NV40_NEW_BLEND;
+}
+
+static void
+nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_blend_state *bso = hwcso;
+
+	so_ref(NULL, &bso->so);
+	FREE(bso);
+}
+
+
+static INLINE unsigned
+wrap_mode(unsigned wrap) {
+	unsigned ret;
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		ret = NV40TCL_TEX_WRAP_S_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		ret = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		ret = NV40TCL_TEX_WRAP_S_CLAMP;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+		break;
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		ret = NV40TCL_TEX_WRAP_S_REPEAT;
+		break;
+	}
+
+	return ret >> NV40TCL_TEX_WRAP_S_SHIFT;
+}
+
+static void *
+nv40_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv40_sampler_state *ps;
+	uint32_t filter = 0;
+
+	ps = MALLOC(sizeof(struct nv40_sampler_state));
+
+	ps->fmt = 0;
+	if (!cso->normalized_coords)
+		ps->fmt |= NV40TCL_TEX_FORMAT_RECT;
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV40TCL_TEX_WRAP_S_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV40TCL_TEX_WRAP_T_SHIFT) |
+		    (wrap_mode(cso->wrap_r) << NV40TCL_TEX_WRAP_R_SHIFT));
+
+	ps->en = 0;
+	if (cso->max_anisotropy >= 2.0) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+		if (cso->max_anisotropy >= 16.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
+		}
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV40TCL_TEX_FILTER_MAG_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV40TCL_TEX_FILTER_MAG_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST;
+		break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+	{
+		float limit;
+
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 7;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 19;
+	}
+
+
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+static void
+nv40_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		nv40->tex_sampler[unit] = sampler[unit];
+		nv40->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nv40->nr_samplers; unit++) {
+		nv40->tex_sampler[unit] = NULL;
+		nv40->dirty_samplers |= (1 << unit);
+	}
+
+	nv40->nr_samplers = nr;
+	nv40->dirty |= NV40_NEW_SAMPLER;
+}
+
+static void
+nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void
+nv40_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		pipe_texture_reference((struct pipe_texture **)
+				       &nv40->tex_miptree[unit], miptree[unit]);
+		nv40->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nv40->nr_textures; unit++) {
+		pipe_texture_reference((struct pipe_texture **)
+				       &nv40->tex_miptree[unit], NULL);
+		nv40->dirty_samplers |= (1 << unit);
+	}
+
+	nv40->nr_textures = nr;
+	nv40->dirty |= NV40_NEW_SAMPLER;
+}
+
+static void *
+nv40_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+
+	so_method(so, curie, NV40TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT :
+				       NV40TCL_SHADE_MODEL_SMOOTH);
+
+	so_method(so, curie, NV40TCL_LINE_WIDTH, 2);
+	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	so_method(so, curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
+	so_data  (so, cso->line_stipple_enable ? 1 : 0);
+	so_data  (so, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	so_method(so, curie, NV40TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+
+	so_method(so, curie, NV40TCL_POLYGON_MODE_FRONT, 6);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CCW);
+	} else {
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CW);
+	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
+
+	so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+
+	so_method(so, curie, NV40TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if (cso->offset_cw || cso->offset_ccw) {
+		so_method(so, curie, NV40TCL_POLYGON_OFFSET_FACTOR, 2);
+		so_data  (so, fui(cso->offset_scale));
+		so_data  (so, fui(cso->offset_units * 2));
+	}
+
+	so_method(so, curie, NV40TCL_POINT_SPRITE, 1);
+	if (cso->point_sprite) {
+		unsigned psctl = (1 << 0), i;
+
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				psctl |= (1 << (8 + i));
+		}
+
+		so_data(so, psctl);
+	} else {
+		so_data(so, 0);
+	}
+
+	so_ref(so, &rsso->so);
+	rsso->pipe = *cso;
+	return (void *)rsso;
+}
+
+static void
+nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->rasterizer = hwcso;
+	nv40->dirty |= NV40_NEW_RAST;
+	nv40->draw_dirty |= NV40_NEW_RAST;
+}
+
+static void
+nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_rasterizer_state *rsso = hwcso;
+
+	so_ref(NULL, &rsso->so);
+	FREE(rsso);
+}
+
+static void *
+nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+
+	so_method(so, curie, NV40TCL_DEPTH_FUNC, 3);
+	so_data  (so, nvgl_comparison_op(cso->depth.func));
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+
+	so_method(so, curie, NV40TCL_ALPHA_TEST_ENABLE, 3);
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	so_data  (so, float_to_ubyte(cso->alpha.ref));
+
+	if (cso->stencil[0].enabled) {
+		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 8);
+		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[0].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_data  (so, cso->stencil[0].ref_value);
+		so_data  (so, cso->stencil[0].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	if (cso->stencil[1].enabled) {
+		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 8);
+		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[1].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_data  (so, cso->stencil[1].ref_value);
+		so_data  (so, cso->stencil[1].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &zsaso->so);
+	zsaso->pipe = *cso;
+	return (void *)zsaso;
+}
+
+static void
+nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->zsa = hwcso;
+	nv40->dirty |= NV40_NEW_ZSA;
+}
+
+static void
+nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_zsa_state *zsaso = hwcso;
+
+	so_ref(NULL, &zsaso->so);
+	FREE(zsaso);
+}
+
+static void *
+nv40_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_vertex_program *vp;
+
+	vp = CALLOC(1, sizeof(struct nv40_vertex_program));
+	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe);
+
+	return (void *)vp;
+}
+
+static void
+nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->vertprog = hwcso;
+	nv40->dirty |= NV40_NEW_VERTPROG;
+	nv40->draw_dirty |= NV40_NEW_VERTPROG;
+}
+
+static void
+nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_vertex_program *vp = hwcso;
+
+	draw_delete_vertex_shader(nv40->draw, vp->draw);
+	nv40_vertprog_destroy(nv40, vp);
+	FREE((void*)vp->pipe.tokens);
+	FREE(vp);
+}
+
+static void *
+nv40_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nv40_fragment_program));
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+
+	return (void *)fp;
+}
+
+static void
+nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->fragprog = hwcso;
+	nv40->dirty |= NV40_NEW_FRAGPROG;
+}
+
+static void
+nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_fragment_program *fp = hwcso;
+
+	nv40_fragprog_destroy(nv40, fp);
+	FREE((void*)fp->pipe.tokens);
+	FREE(fp);
+}
+
+static void
+nv40_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->blend_colour = *bcol;
+	nv40->dirty |= NV40_NEW_BCOL;
+}
+
+static void
+nv40_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->clip = *clip;
+	nv40->dirty |= NV40_NEW_UCP;
+	nv40->draw_dirty |= NV40_NEW_UCP;
+}
+
+static void
+nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->constbuf[shader] = buf->buffer;
+	nv40->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nv40->dirty |= NV40_NEW_VERTPROG;
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nv40->dirty |= NV40_NEW_FRAGPROG;
+	}
+}
+
+static void
+nv40_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->framebuffer = *fb;
+	nv40->dirty |= NV40_NEW_FB;
+}
+
+static void
+nv40_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	memcpy(nv40->stipple, stipple->stipple, 4 * 32);
+	nv40->dirty |= NV40_NEW_STIPPLE;
+}
+
+static void
+nv40_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->scissor = *s;
+	nv40->dirty |= NV40_NEW_SCISSOR;
+}
+
+static void
+nv40_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->viewport = *vpt;
+	nv40->dirty |= NV40_NEW_VIEWPORT;
+	nv40->draw_dirty |= NV40_NEW_VIEWPORT;
+}
+
+static void
+nv40_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	memcpy(nv40->vtxbuf, vb, sizeof(*vb) * count);
+	nv40->vtxbuf_nr = count;
+
+	nv40->dirty |= NV40_NEW_ARRAYS;
+	nv40->draw_dirty |= NV40_NEW_ARRAYS;
+}
+
+static void
+nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	memcpy(nv40->vtxelt, ve, sizeof(*ve) * count);
+	nv40->vtxelt_nr = count;
+
+	nv40->dirty |= NV40_NEW_ARRAYS;
+	nv40->draw_dirty |= NV40_NEW_ARRAYS;
+}
+
+static void
+nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+
+	nv40->edgeflags = bitfield;
+	nv40->dirty |= NV40_NEW_ARRAYS;
+	nv40->draw_dirty |= NV40_NEW_ARRAYS;
+}
+
+void
+nv40_init_state_functions(struct nv40_context *nv40)
+{
+	nv40->pipe.create_blend_state = nv40_blend_state_create;
+	nv40->pipe.bind_blend_state = nv40_blend_state_bind;
+	nv40->pipe.delete_blend_state = nv40_blend_state_delete;
+
+	nv40->pipe.create_sampler_state = nv40_sampler_state_create;
+	nv40->pipe.bind_sampler_states = nv40_sampler_state_bind;
+	nv40->pipe.delete_sampler_state = nv40_sampler_state_delete;
+	nv40->pipe.set_sampler_textures = nv40_set_sampler_texture;
+
+	nv40->pipe.create_rasterizer_state = nv40_rasterizer_state_create;
+	nv40->pipe.bind_rasterizer_state = nv40_rasterizer_state_bind;
+	nv40->pipe.delete_rasterizer_state = nv40_rasterizer_state_delete;
+
+	nv40->pipe.create_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_create;
+	nv40->pipe.bind_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_bind;
+	nv40->pipe.delete_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_delete;
+
+	nv40->pipe.create_vs_state = nv40_vp_state_create;
+	nv40->pipe.bind_vs_state = nv40_vp_state_bind;
+	nv40->pipe.delete_vs_state = nv40_vp_state_delete;
+
+	nv40->pipe.create_fs_state = nv40_fp_state_create;
+	nv40->pipe.bind_fs_state = nv40_fp_state_bind;
+	nv40->pipe.delete_fs_state = nv40_fp_state_delete;
+
+	nv40->pipe.set_blend_color = nv40_set_blend_color;
+	nv40->pipe.set_clip_state = nv40_set_clip_state;
+	nv40->pipe.set_constant_buffer = nv40_set_constant_buffer;
+	nv40->pipe.set_framebuffer_state = nv40_set_framebuffer_state;
+	nv40->pipe.set_polygon_stipple = nv40_set_polygon_stipple;
+	nv40->pipe.set_scissor_state = nv40_set_scissor_state;
+	nv40->pipe.set_viewport_state = nv40_set_viewport_state;
+
+	nv40->pipe.set_edgeflags = nv40_set_edgeflags;
+	nv40->pipe.set_vertex_buffers = nv40_set_vertex_buffers;
+	nv40->pipe.set_vertex_elements = nv40_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h
new file mode 100644
index 00000000000..8a9d8c8fdf6
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state.h
@@ -0,0 +1,88 @@
+#ifndef __NV40_STATE_H__
+#define __NV40_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv40_sampler_state {
+	uint32_t fmt;
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+struct nv40_vertex_program_exec {
+	uint32_t data[4];
+	boolean has_branch_offset;
+	int const_index;
+};
+
+struct nv40_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4];
+};
+
+struct nv40_vertex_program {
+	struct pipe_shader_state pipe;
+
+	struct draw_vertex_shader *draw;
+
+	boolean translated;
+
+	struct pipe_clip_state ucp;
+
+	struct nv40_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv40_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+	uint32_t clip_ctrl;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+struct nv40_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	unsigned samplers;
+
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv40_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c
new file mode 100644
index 00000000000..95e6d7394f4
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_blend.c
@@ -0,0 +1,40 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_blend_validate(struct nv40_context *nv40)
+{
+	so_ref(nv40->blend->so, &nv40->state.hw[NV40_STATE_BLEND]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_blend = {
+	.validate = nv40_state_blend_validate,
+	.dirty = {
+		.pipe = NV40_NEW_BLEND,
+		.hw = NV40_STATE_BLEND
+	}
+};
+
+static boolean
+nv40_state_blend_colour_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj *so = so_new(2, 0);
+	struct pipe_blend_color *bcol = &nv40->blend_colour;
+
+	so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1);
+	so_data  (so, ((float_to_ubyte(bcol->color[3]) << 24) |
+		       (float_to_ubyte(bcol->color[0]) << 16) |
+		       (float_to_ubyte(bcol->color[1]) <<  8) |
+		       (float_to_ubyte(bcol->color[2]) <<  0)));
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_BCOL]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_blend_colour = {
+	.validate = nv40_state_blend_colour_validate,
+	.dirty = {
+		.pipe = NV40_NEW_BCOL,
+		.hw = NV40_STATE_BCOL
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
new file mode 100644
index 00000000000..ab88dc416e5
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -0,0 +1,184 @@
+#include "nv40_context.h"
+#include "nv40_state.h"
+#include "draw/draw_context.h"
+
+static struct nv40_state_entry *render_states[] = {
+	&nv40_state_framebuffer,
+	&nv40_state_rasterizer,
+	&nv40_state_scissor,
+	&nv40_state_stipple,
+	&nv40_state_fragprog,
+	&nv40_state_fragtex,
+	&nv40_state_vertprog,
+	&nv40_state_blend,
+	&nv40_state_blend_colour,
+	&nv40_state_zsa,
+	&nv40_state_viewport,
+	&nv40_state_vbo,
+	NULL
+};
+
+static struct nv40_state_entry *swtnl_states[] = {
+	&nv40_state_framebuffer,
+	&nv40_state_rasterizer,
+	&nv40_state_scissor,
+	&nv40_state_stipple,
+	&nv40_state_fragprog,
+	&nv40_state_fragtex,
+	&nv40_state_vertprog,
+	&nv40_state_blend,
+	&nv40_state_blend_colour,
+	&nv40_state_zsa,
+	&nv40_state_viewport,
+	&nv40_state_vtxfmt,
+	NULL
+};
+
+static void
+nv40_state_do_validate(struct nv40_context *nv40,
+		       struct nv40_state_entry **states)
+{
+	const struct pipe_framebuffer_state *fb = &nv40->framebuffer;
+	unsigned i;
+
+	for (i = 0; i < fb->num_cbufs; i++)
+		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED;
+	if (fb->zsbuf)
+		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	while (*states) {
+		struct nv40_state_entry *e = *states;
+
+		if (nv40->dirty & e->dirty.pipe) {
+			if (e->validate(nv40))
+				nv40->state.dirty |= (1ULL << e->dirty.hw);
+		}
+
+		states++;
+	}
+	nv40->dirty = 0;
+}
+
+void
+nv40_state_emit(struct nv40_context *nv40)
+{
+	struct nv40_state *state = &nv40->state;
+	struct nv40_screen *screen = nv40->screen;
+	unsigned i, samplers;
+	uint64 states;
+
+	if (nv40->pctx_id != screen->cur_pctx) {
+		for (i = 0; i < NV40_STATE_MAX; i++) {
+			if (state->hw[i] && screen->state[i] != state->hw[i])
+				state->dirty |= (1ULL << i);
+		}
+
+		screen->cur_pctx = nv40->pctx_id;
+	}
+
+	for (i = 0, states = state->dirty; states; i++) {
+		if (!(states & (1ULL << i)))
+			continue;
+		so_ref (state->hw[i], &nv40->screen->state[i]);
+		if (state->hw[i])
+			so_emit(nv40->nvws, nv40->screen->state[i]);
+		states &= ~(1ULL << i);
+	}
+
+	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
+			    (1ULL << NV40_STATE_FRAGTEX0))) {
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (1);
+	}
+
+	state->dirty = 0;
+
+	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]);
+	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
+		if (!(samplers & (1 << i)))
+			continue;
+		so_emit_reloc_markers(nv40->nvws,
+				      state->hw[NV40_STATE_FRAGTEX0+i]);
+		samplers &= ~(1ULL << i);
+	}
+	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]);
+	if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW)
+		so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]);
+}
+
+boolean
+nv40_state_validate(struct nv40_context *nv40)
+{
+	boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE;
+
+	if (nv40->render_mode != HW) {
+		/* Don't even bother trying to go back to hw if none
+		 * of the states that caused swtnl previously have changed.
+		 */
+		if ((nv40->fallback_swtnl & nv40->dirty)
+				!= nv40->fallback_swtnl)
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nv40->pipe.flush(&nv40->pipe, 0, NULL);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS);
+		nv40->render_mode = HW;
+	}
+
+	nv40_state_do_validate(nv40, render_states);
+	if (nv40->fallback_swtnl || nv40->fallback_swrast)
+		return FALSE;
+	
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+
+	return TRUE;
+}
+
+boolean
+nv40_state_validate_swtnl(struct nv40_context *nv40)
+{
+	struct draw_context *draw = nv40->draw;
+
+	/* Setup for swtnl */
+	if (nv40->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl);
+		nv40->pipe.flush(&nv40->pipe, 0, NULL);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS);
+		nv40->render_mode = SWTNL;
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_VERTPROG)
+		draw_bind_vertex_shader(draw, nv40->vertprog->draw);
+
+	if (nv40->draw_dirty & NV40_NEW_RAST)
+		draw_set_rasterizer_state(draw, &nv40->rasterizer->pipe);
+
+	if (nv40->draw_dirty & NV40_NEW_UCP)
+		draw_set_clip_state(draw, &nv40->clip);
+
+	if (nv40->draw_dirty & NV40_NEW_VIEWPORT)
+		draw_set_viewport_state(draw, &nv40->viewport);
+
+	if (nv40->draw_dirty & NV40_NEW_ARRAYS) {
+		draw_set_edgeflags(draw, nv40->edgeflags);
+		draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf);
+		draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt);	
+	}
+
+	nv40_state_do_validate(nv40, swtnl_states);
+	if (nv40->fallback_swrast) {
+		NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast);
+		return FALSE;
+	}
+
+	nv40->draw_dirty = 0;
+	return TRUE;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
new file mode 100644
index 00000000000..0e4e60eaa75
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -0,0 +1,144 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_framebuffer_validate(struct nv40_context *nv40)
+{
+	struct pipe_framebuffer_state *fb = &nv40->framebuffer;
+	struct pipe_surface *rt[4], *zeta;
+	uint32_t rt_enable, rt_format;
+	int i, colour_format = 0, zeta_format = 0;
+	struct nouveau_stateobj *so = so_new(64, 10);
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+
+	rt_enable = 0;
+	for (i = 0; i < fb->num_cbufs; i++) {
+		if (colour_format) {
+			assert(colour_format == fb->cbufs[i]->format);
+		} else {
+			colour_format = fb->cbufs[i]->format;
+			rt_enable |= (NV40TCL_RT_ENABLE_COLOR0 << i);
+			rt[i] = fb->cbufs[i];
+		}
+	}
+
+	if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 |
+			 NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV40TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR;
+
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV40TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z16;
+		break;
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case 0:
+		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1);
+		so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2);
+		so_data  (so, rt[0]->stride);
+		so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1);
+		so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_data  (so, rt[1]->stride);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1);
+		so_reloc (so, rt[2]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1);
+		so_reloc (so, rt[2]->buffer, rt[2]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1);
+		so_data  (so, rt[2]->stride);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1);
+		so_reloc (so, rt[3]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1);
+		so_reloc (so, rt[3]->buffer, rt[3]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1);
+		so_data  (so, rt[3]->stride);
+	}
+
+	if (zeta_format) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1);
+		so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1);
+		so_reloc (so, zeta->buffer, zeta->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1);
+		so_data  (so, zeta->stride);
+	}
+
+	so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1);
+	so_data  (so, rt_enable);
+	so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_data  (so, rt_format);
+	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_data  (so, ((w - 1) << 16) | 0);
+	so_data  (so, ((h - 1) << 16) | 0);
+	so_method(so, nv40->screen->curie, 0x1d88, 1);
+	so_data  (so, (1 << 12) | h);
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_FB]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_framebuffer = {
+	.validate = nv40_state_framebuffer_validate,
+	.dirty = {
+		.pipe = NV40_NEW_FB,
+		.hw = NV40_STATE_FB
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_rasterizer.c b/src/gallium/drivers/nv40/nv40_state_rasterizer.c
new file mode 100644
index 00000000000..9ecda5990f0
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_rasterizer.c
@@ -0,0 +1,17 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_rasterizer_validate(struct nv40_context *nv40)
+{
+	so_ref(nv40->rasterizer->so,
+	       &nv40->state.hw[NV40_STATE_RAST]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_rasterizer = {
+	.validate = nv40_state_rasterizer_validate,
+	.dirty = {
+		.pipe = NV40_NEW_RAST,
+		.hw = NV40_STATE_RAST
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c
new file mode 100644
index 00000000000..285239ef419
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_scissor.c
@@ -0,0 +1,35 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_scissor_validate(struct nv40_context *nv40)
+{
+	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe;
+	struct pipe_scissor_state *s = &nv40->scissor;
+	struct nouveau_stateobj *so;
+
+	if (nv40->state.hw[NV40_STATE_SCISSOR] &&
+	    (rast->scissor == 0 && nv40->state.scissor_enabled == 0))
+		return FALSE;
+	nv40->state.scissor_enabled = rast->scissor;
+
+	so = so_new(3, 0);
+	so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2);
+	if (nv40->state.scissor_enabled) {
+		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
+		so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
+	} else {
+		so_data  (so, 4096 << 16);
+		so_data  (so, 4096 << 16);
+	}
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_SCISSOR]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_scissor = {
+	.validate = nv40_state_scissor_validate,
+	.dirty = {
+		.pipe = NV40_NEW_SCISSOR | NV40_NEW_RAST,
+		.hw = NV40_STATE_SCISSOR
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c
new file mode 100644
index 00000000000..b51024ad9b2
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_stipple.c
@@ -0,0 +1,39 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_stipple_validate(struct nv40_context *nv40)
+{
+	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nouveau_stateobj *so;
+
+	if (nv40->state.hw[NV40_STATE_STIPPLE] &&
+	   (rast->poly_stipple_enable == 0 && nv40->state.stipple_enabled == 0))
+		return FALSE;
+
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+
+		so = so_new(35, 0);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+		for (i = 0; i < 32; i++)
+			so_data(so, nv40->stipple[i]);
+	} else {
+		so = so_new(2, 0);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_STIPPLE]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_stipple = {
+	.validate = nv40_state_stipple_validate,
+	.dirty = {
+		.pipe = NV40_NEW_STIPPLE | NV40_NEW_RAST,
+		.hw = NV40_STATE_STIPPLE,
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
new file mode 100644
index 00000000000..869a55b4053
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -0,0 +1,67 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_viewport_validate(struct nv40_context *nv40)
+{
+	struct pipe_viewport_state *vpt = &nv40->viewport;
+	struct nouveau_stateobj *so;
+	unsigned bypass;
+
+	if (nv40->render_mode == HW && !nv40->rasterizer->pipe.bypass_clipping)
+		bypass = 0;
+	else
+		bypass = 1;
+
+	if (nv40->state.hw[NV40_STATE_VIEWPORT] &&
+	    (bypass || !(nv40->dirty & NV40_NEW_VIEWPORT)) &&
+	    nv40->state.viewport_bypass == bypass)
+		return FALSE;
+	nv40->state.viewport_bypass = bypass;
+
+	so = so_new(11, 0);
+	if (!bypass) {
+		so_method(so, nv40->screen->curie,
+			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(vpt->translate[0]));
+		so_data  (so, fui(vpt->translate[1]));
+		so_data  (so, fui(vpt->translate[2]));
+		so_data  (so, fui(vpt->translate[3]));
+		so_data  (so, fui(vpt->scale[0]));
+		so_data  (so, fui(vpt->scale[1]));
+		so_data  (so, fui(vpt->scale[2]));
+		so_data  (so, fui(vpt->scale[3]));
+		so_method(so, nv40->screen->curie, 0x1d78, 1);
+		so_data  (so, 1);
+	} else {
+		so_method(so, nv40->screen->curie,
+			  NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(0.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(1.0));
+		so_data  (so, fui(0.0));
+		/* Not entirely certain what this is yet.  The DDX uses this
+		 * value also as it fixes rendering when you pass
+		 * pre-transformed vertices to the GPU.  My best gusss is that
+		 * this bypasses some culling/clipping stage.  Might be worth
+		 * noting that points/lines are uneffected by whatever this
+		 * value fixes, only filled polygons are effected.
+		 */
+		so_method(so, nv40->screen->curie, 0x1d78, 1);
+		so_data  (so, 0x110);
+	}
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_viewport = {
+	.validate = nv40_state_viewport_validate,
+	.dirty = {
+		.pipe = NV40_NEW_VIEWPORT | NV40_NEW_RAST,
+		.hw = NV40_STATE_VIEWPORT
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_zsa.c b/src/gallium/drivers/nv40/nv40_state_zsa.c
new file mode 100644
index 00000000000..fb760677c88
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_zsa.c
@@ -0,0 +1,17 @@
+#include "nv40_context.h"
+
+static boolean
+nv40_state_zsa_validate(struct nv40_context *nv40)
+{
+	so_ref(nv40->zsa->so,
+	       &nv40->state.hw[NV40_STATE_ZSA]);
+	return TRUE;
+}
+
+struct nv40_state_entry nv40_state_zsa = {
+	.validate = nv40_state_zsa_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ZSA,
+		.hw = NV40_STATE_ZSA
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c
new file mode 100644
index 00000000000..576af7c59ee
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_surface.c
@@ -0,0 +1,77 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv40_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_tile.h"
+
+static void
+nv40_surface_copy(struct pipe_context *pipe, boolean do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_winsys *nvws = nv40->nvws;
+
+	if (do_flip) {
+		/*XXX: This dodgyness will do for now for correctness.  But,
+		 *     need to investigate whether the 2D engine is able to
+		 *     manage a flip (perhaps SIFM?), if not, use the 3D engine
+		 */
+		desty += height;
+		while (height--) {
+			nvws->surface_copy(nvws, dest, destx, desty--, src,
+					   srcx, srcy++, width, 1);
+		}
+	} else {
+		nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+				   width, height);
+	}
+}
+
+static void
+nv40_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_winsys *nvws = nv40->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+void
+nv40_init_surface_functions(struct nv40_context *nv40)
+{
+	nv40->pipe.surface_copy = nv40_surface_copy;
+	nv40->pipe.surface_fill = nv40_surface_fill;
+}
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
new file mode 100644
index 00000000000..09f6e79d32a
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -0,0 +1,555 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+#define FORCE_SWTNL 0
+
+static INLINE int
+nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	switch (pipe) {
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		*fmt = NV40TCL_VTXFMT_TYPE_FLOAT;
+		break;
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		*fmt = NV40TCL_VTXFMT_TYPE_UBYTE;
+		break;
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*fmt = NV40TCL_VTXFMT_TYPE_USHORT;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe));
+		return 1;
+	}
+
+	switch (pipe) {
+	case PIPE_FORMAT_R8_UNORM:
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R16_SSCALED:
+		*ncomp = 1;
+		break;
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R16G16_SSCALED:
+		*ncomp = 2;
+		break;
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+		*ncomp = 3;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		*ncomp = 4;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe));
+		return 1;
+	}
+
+	return 0;
+}
+
+static boolean
+nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib,
+		    unsigned ib_size)
+{
+	struct pipe_screen *pscreen = &nv40->screen->pipe;
+	unsigned type;
+
+	if (!ib) {
+		nv40->idxbuf = NULL;
+		nv40->idxbuf_format = 0xdeadbeef;
+		return FALSE;
+	}
+
+	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1)
+		return FALSE;
+
+	switch (ib_size) {
+	case 2:
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U16;
+		break;
+	case 4:
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32;
+		break;
+	default:
+		return FALSE;
+	}
+
+	if (ib != nv40->idxbuf ||
+	    type != nv40->idxbuf_format) {
+		nv40->dirty |= NV40_NEW_ARRAYS;
+		nv40->idxbuf = ib;
+		nv40->idxbuf_format = type;
+	}
+
+	return TRUE;
+}
+
+static boolean
+nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+{
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	unsigned type, ncomp;
+	void *map;
+
+	if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp))
+		return FALSE;
+
+	map  = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	map += vb->buffer_offset + ve->src_offset;
+
+	switch (type) {
+	case NV40TCL_VTXFMT_TYPE_FLOAT:
+	{
+		float *v = map;
+
+		switch (ncomp) {
+		case 4:
+			so_method(so, curie, NV40TCL_VTX_ATTR_4F_X(attrib), 4);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			so_data  (so, fui(v[3]));
+			break;
+		case 3:
+			so_method(so, curie, NV40TCL_VTX_ATTR_3F_X(attrib), 3);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			break;
+		case 2:
+			so_method(so, curie, NV40TCL_VTX_ATTR_2F_X(attrib), 2);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			break;
+		case 1:
+			so_method(so, curie, NV40TCL_VTX_ATTR_1F(attrib), 1);
+			so_data  (so, fui(v[0]));
+			break;
+		default:
+			ws->buffer_unmap(ws, vb->buffer);
+			return FALSE;
+		}
+	}
+		break;
+	default:
+		ws->buffer_unmap(ws, vb->buffer);
+		return FALSE;
+	}
+
+	ws->buffer_unmap(ws, vb->buffer);
+
+	return TRUE;
+}
+
+boolean
+nv40_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_channel *chan = nv40->nvws->channel;
+	unsigned restart;
+
+	nv40_vbo_set_idxbuf(nv40, NULL, 0);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+
+	while (count) {
+		unsigned vc, nr;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static INLINE void
+nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		if (vc & 1) {
+			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING((elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		if (vc & 1) {
+			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING((elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		while (vc) {
+			push = MIN2(vc, 2047);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (elts, push);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+static boolean
+nv40_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!ib) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
+
+	switch (ib_size) {
+	case 1:
+		nv40_draw_elements_u08(nv40, map, mode, start, count);
+		break;
+	case 2:
+		nv40_draw_elements_u16(nv40, map, mode, start, count);
+		break;
+	case 4:
+		nv40_draw_elements_u32(nv40, map, mode, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	ws->buffer_unmap(ws, ib);
+	return TRUE;
+}
+
+static boolean
+nv40_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_channel *chan = nv40->nvws->channel;
+	unsigned restart;
+
+	while (count) {
+		unsigned nr, vc;
+
+		nv40_state_emit(nv40);
+
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	return TRUE;
+}
+
+boolean
+nv40_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	boolean idxbuf;
+
+	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+
+	if (idxbuf) {
+		nv40_draw_elements_vbo(pipe, mode, start, count);
+	} else {
+		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static boolean
+nv40_vbo_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct pipe_buffer *ib = nv40->idxbuf;
+	unsigned ib_format = nv40->idxbuf_format;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	int hw;
+
+	if (nv40->edgeflags) {
+		nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
+		return FALSE;
+	}
+
+	vtxbuf = so_new(20, 18);
+	so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
+	vtxfmt = so_new(17, 0);
+	so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
+
+	for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nv40->vtxelt[hw];
+		vb = &nv40->vtxbuf[ve->vertex_buffer_index];
+
+		if (!vb->pitch) {
+			if (!sattr)
+				sattr = so_new(16 * 5, 0);
+
+			if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) {
+				so_data(vtxbuf, 0);
+				so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT);
+				continue;
+			}
+		}
+
+		if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
+			nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
+			so_ref(NULL, &vtxbuf);
+			so_ref(NULL, &vtxfmt);
+			return FALSE;
+		}
+
+		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
+			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+			 0, NV40TCL_VTXBUF_ADDRESS_DMA1);
+		so_data (vtxfmt, ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) |
+				  (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type));
+	}
+
+	if (ib) {
+		so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2);
+		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
+			  0, NV40TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	so_method(vtxbuf, curie, 0x1710, 1);
+	so_data  (vtxbuf, 0);
+
+	so_ref(vtxbuf, &nv40->state.hw[NV40_STATE_VTXBUF]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXBUF);
+	so_ref(vtxfmt, &nv40->state.hw[NV40_STATE_VTXFMT]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXFMT);
+	so_ref(sattr, &nv40->state.hw[NV40_STATE_VTXATTR]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXATTR);
+	return FALSE;
+}
+
+struct nv40_state_entry nv40_state_vbo = {
+	.validate = nv40_vbo_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ARRAYS,
+		.hw = 0,
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
new file mode 100644
index 00000000000..ff988e6a5f4
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -0,0 +1,1070 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0
+#define DEF_CTEST 0
+#include "nv40_shader.h"
+
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))
+
+#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
+
+struct nv40_vpc {
+	struct nv40_vertex_program *vp;
+
+	struct nv40_vertex_program_exec *vpi;
+
+	unsigned r_temps;
+	unsigned r_temps_discard;
+	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nv40_sreg *r_address;
+	struct nv40_sreg *r_temp;
+
+	struct nv40_sreg *imm;
+	unsigned nr_imm;
+
+	unsigned hpos_idx;
+};
+
+static struct nv40_sreg
+temp(struct nv40_vpc *vpc)
+{
+	int idx = ffs(~vpc->r_temps) - 1;
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nv40_sr(NV40SR_TEMP, 0);
+	}
+
+	vpc->r_temps |= (1 << idx);
+	vpc->r_temps_discard |= (1 << idx);
+	return nv40_sr(NV40SR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nv40_vpc *vpc)
+{
+	vpc->r_temps &= ~vpc->r_temps_discard;
+	vpc->r_temps_discard = 0;
+}
+
+static struct nv40_sreg
+constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	struct nv40_vertex_program_data *vpd;
+	int idx;
+
+	if (pipe >= 0) {
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nv40_sr(NV40SR_CONST, idx);
+		}
+	}
+
+	idx = vp->nr_consts++;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nv40_sr(NV40SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))
+
+static void
+emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV40SR_TEMP:
+		sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT);
+		sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
+		break;
+	case NV40SR_INPUT:
+		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		vp->ir |= (1 << src.index);
+		hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);
+		break;
+	case NV40SR_CONST:
+		sr |= (NV40_VP_SRC_REG_TYPE_CONST <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);
+		vpc->vpi->const_index = src.index;
+		break;
+	case NV40SR_NONE:
+		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV40_VP_SRC_NEGATE;
+
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));
+
+	sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));
+
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
+			  NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
+			  NV40_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
+			  NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
+			  NV40_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+
+	switch (dst.type) {
+	case NV40SR_TEMP:
+		hw[3] |= NV40_VP_INST_DEST_MASK;
+		if (slot == 0) {
+			hw[0] |= (dst.index <<
+				  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+		} else {
+			hw[3] |= (dst.index << 
+				  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+		}
+		break;
+	case NV40SR_OUTPUT:
+		switch (dst.index) {
+		case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+		case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+		case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+		case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+		case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
+		case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+		case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+		case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+		case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+		case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+		case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+		case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+		case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+		case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+		case NV40_VP_INST_DEST_CLIP(0):
+			vp->or |= (1 << 6);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(1):
+			vp->or |= (1 << 7);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(2):
+			vp->or |= (1 << 8);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(3):
+			vp->or |= (1 << 9);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		case NV40_VP_INST_DEST_CLIP(4):
+			vp->or |= (1 << 10);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		case NV40_VP_INST_DEST_CLIP(5):
+			vp->or |= (1 << 11);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		default:
+			break;
+		}
+
+		hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+		if (slot == 0) {
+			hw[0] |= NV40_VP_INST_VEC_RESULT;
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+		} else {
+			hw[3] |= NV40_VP_INST_SCA_RESULT;
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1,
+	      struct nv40_sreg s2)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1;
+
+	hw = vpc->vpi->data;
+
+	hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));
+
+	if (slot == 0) {
+		hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+		hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+	} else {
+		hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+		hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+		hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+	}
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+}
+
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv40_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = vpc->r_temp[fsrc->SrcRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nv40_sreg dst;
+
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		dst = vpc->r_result[fdst->DstRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = vpc->r_temp[fdst->DstRegister.Index];
+		break;
+	case TGSI_FILE_ADDRESS:
+		dst = vpc->r_address[fdst->DstRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		break;
+	}
+
+	return dst;
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	int mask = 0;
+
+	if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X;
+	if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y;
+	if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z;
+	if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W;
+	return mask;
+}
+
+static boolean
+src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv40_sreg *src)
+{
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+
+	for (c = 0; c < 4; c++) {
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= tgsi_mask(1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= tgsi_mask(1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= tgsi_mask(1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+
+		if (!tgsi.negate && neg[c])
+			neg_mask |= tgsi_mask(1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)
+		return TRUE;
+
+	*src = temp(vpc);
+
+	if (mask)
+		arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);
+
+	if (zero_mask)
+		arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);
+
+	if (one_mask)
+		arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);
+
+	if (neg_mask) {
+		struct nv40_sreg one = temp(vpc);
+		arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
+		arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{
+	struct nv40_sreg src[3], dst, tmp;
+	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(vpc, fsrc, &src[i]))
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->SrcRegister.Index) {
+				ii = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(vpc);
+	return TRUE;
+}
+
+static boolean
+nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	unsigned idx = fdec->DeclarationRange.First;
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV40_VP_INST_DEST_POS;
+		vpc->hpos_idx = idx;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_VP_INST_DEST_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_VP_INST_DEST_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_VP_INST_DEST_BFC0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_VP_INST_DEST_BFC1;
+		} else {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV40_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NV40_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
+	return TRUE;
+}
+
+static boolean
+nv40_vertprog_prepare(struct nv40_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp) {
+					high_temp =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+#if 0 /* this would be nice.. except gallium doesn't track it */
+			case TGSI_FILE_ADDRESS:
+				if (fdec->DeclarationRange.Last > high_addr) {
+					high_addr =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+#endif
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_vertprog_parse_decl_output(vpc, fdec))
+					return FALSE;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->FullDstRegisters[0];
+
+			if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
+				if (fdst->DstRegister.Index > high_addr)
+					high_addr = fdst->DstRegister.Index;
+			}
+		
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
+		assert(vpc->imm);
+	}
+
+	if (++high_temp) {
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);
+	}
+
+	if (++high_addr) {
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);
+	}
+
+	vpc->r_temps_discard = 0;
+	return TRUE;
+}
+
+static void
+nv40_vertprog_translate(struct nv40_context *nv40,
+			struct nv40_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_vpc *vpc = NULL;
+	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nv40_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+
+	if (!nv40_vertprog_prepare(vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code the the vtxprog
+	 * to handle clip planes later.
+	 */
+	if (vp->ucp.nr)  {
+		vpc->r_result[vpc->hpos_idx] = temp(vpc);
+		vpc->r_temps_discard = 0;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+//			assert(imm->Immediate.Size == 4);
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u.ImmediateFloat32[0].Float,
+					 imm->u.ImmediateFloat32[1].Float,
+					 imm->u.ImmediateFloat32[2].Float,
+					 imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Write out HPOS if it was redirected to a temp earlier */
+	if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) {
+		struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT,
+						NV40_VP_INST_DEST_POS);
+		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];
+
+		arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none);
+	}
+
+	/* Insert code to handle user clip planes */
+	for (i = 0; i < vp->ucp.nr; i++) {
+		struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT,
+						NV40_VP_INST_DEST_CLIP(i));
+		struct nv40_sreg ceqn = constant(vpc, -1,
+						 nv40->clip.ucp[i][0],
+						 nv40->clip.ucp[i][1],
+						 nv40->clip.ucp[i][2],
+						 nv40->clip.ucp[i][3]);
+		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		unsigned mask;
+
+		switch (i) {
+		case 0: case 3: mask = MASK_Y; break;
+		case 1: case 4: mask = MASK_Z; break;
+		case 2: case 5: mask = MASK_W; break;
+		default:
+			NOUVEAU_ERR("invalid clip dist #%d\n", i);
+			goto out_err;
+		}
+
+		arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none);
+	}
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp); 
+	if (vpc->r_address)
+		FREE(vpc->r_address); 
+	if (vpc->imm)	
+		FREE(vpc->imm); 
+	FREE(vpc);
+}
+
+static boolean
+nv40_vertprog_validate(struct nv40_context *nv40)
+{ 
+	struct nouveau_winsys *nvws = nv40->nvws;
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_vertex_program *vp;
+	struct pipe_buffer *constbuf;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	if (nv40->render_mode == HW) {
+		vp = nv40->vertprog;
+		constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];
+
+		if ((nv40->dirty & NV40_NEW_UCP) ||
+		    memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) {
+			nv40_vertprog_destroy(nv40, vp);
+			memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp));
+		}
+	} else {
+		vp = nv40->swtnl.vertprog;
+		constbuf = NULL;
+	}
+
+	/* Translate TGSI shader into hw bytecode */
+	if (vp->translated)
+		goto check_gpu_resources;
+
+	nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
+	nv40_vertprog_translate(nv40, vp);
+	if (!vp->translated) {
+		nv40->fallback_swtnl |= NV40_NEW_VERTPROG;
+		return FALSE;
+	}
+
+check_gpu_resources:
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nv40->screen->vp_exec_heap;
+		struct nouveau_stateobj *so;
+		uint vplen = vp->nr_insns;
+
+		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+			while (heap->next && heap->size < vplen) {
+				struct nv40_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->exec);
+			}
+
+			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		so = so_new(7, 0);
+		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
+		so_data  (so, vp->exec->start);
+		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
+		so_data  (so, vp->ir);
+		so_data  (so, vp->or);
+		so_method(so, curie,  NV40TCL_CLIP_PLANE_ENABLE, 1);
+		so_data  (so, vp->clip_ctrl);
+		so_ref(so, &vp->so);
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nv40->screen->vp_data_heap;
+
+		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nv40_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv40_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv40_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK;
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NV40_VP_INST_CONST_SRC_SHIFT;
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = ws->buffer_map(ws, constbuf,
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nv40_vertex_program_data *vpd = &vp->consts[i];
+
+			if (vpd->index >= 0) {
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (i + vp->data->start);
+			OUT_RINGp ((uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf)
+			ws->buffer_unmap(ws, constbuf);
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (vp->insns[i].data, 4);
+		}
+	}
+
+	if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) {
+		so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
+{
+	struct nouveau_winsys *nvws = nv40->screen->nvws;
+
+	vp->translated = FALSE;
+
+	if (vp->nr_insns) {
+		FREE(vp->insns);
+		vp->insns = NULL;
+		vp->nr_insns = 0;
+	}
+
+	if (vp->nr_consts) {
+		FREE(vp->consts);
+		vp->consts = NULL;
+		vp->nr_consts = 0;
+	}
+
+	nvws->res_free(&vp->exec);
+	vp->exec_start = 0;
+	nvws->res_free(&vp->data);
+	vp->data_start = 0;
+	vp->data_start_min = 0;
+
+	vp->ir = vp->or = vp->clip_ctrl = 0;
+	so_ref(NULL, &vp->so);
+}
+
+struct nv40_state_entry nv40_state_vertprog = {
+	.validate = nv40_vertprog_validate,
+	.dirty = {
+		.pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP,
+		.hw = NV40_STATE_VERTPROG,
+	}
+};
+
diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile
new file mode 100644
index 00000000000..be30400c039
--- /dev/null
+++ b/src/gallium/drivers/nv50/Makefile
@@ -0,0 +1,29 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv50
+
+DRIVER_SOURCES = \
+	nv50_clear.c \
+	nv50_context.c \
+	nv50_draw.c \
+	nv50_miptree.c \
+	nv50_query.c \
+	nv50_program.c \
+	nv50_screen.c \
+	nv50_state.c \
+	nv50_state_validate.c \
+	nv50_surface.c \
+	nv50_tex.c \
+	nv50_vbo.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c
new file mode 100644
index 00000000000..a31a42d6b54
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_clear.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv50_context.h"
+
+void
+nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct pipe_framebuffer_state fb, s_fb = nv50->framebuffer;
+	struct pipe_scissor_state sc, s_sc = nv50->scissor;
+	unsigned dirty = nv50->dirty;
+
+	nv50->dirty = 0;
+
+	if (ps->format == PIPE_FORMAT_Z24S8_UNORM ||
+	    ps->format == PIPE_FORMAT_Z16_UNORM) {
+		fb.num_cbufs = 0;
+		fb.zsbuf = ps;
+	} else {
+		fb.num_cbufs = 1;
+		fb.cbufs[0] = ps;
+		fb.zsbuf = NULL;
+	}
+	fb.width = ps->width;
+	fb.height = ps->height;
+	pipe->set_framebuffer_state(pipe, &fb);
+
+	sc.minx = sc.miny = 0;
+	sc.maxx = fb.width;
+	sc.maxy = fb.height;
+	pipe->set_scissor_state(pipe, &sc);
+
+	nv50_state_validate(nv50);
+
+	switch (ps->format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+		BEGIN_RING(tesla, 0x0d80, 4);
+		OUT_RINGf (ubyte_to_float((clearValue >> 16) & 0xff));
+		OUT_RINGf (ubyte_to_float((clearValue >>  8) & 0xff));
+		OUT_RINGf (ubyte_to_float((clearValue >>  0) & 0xff));
+		OUT_RINGf (ubyte_to_float((clearValue >> 24) & 0xff));
+		BEGIN_RING(tesla, 0x19d0, 1);
+		OUT_RING  (0x3c);
+		break;
+	case PIPE_FORMAT_Z24S8_UNORM:
+		BEGIN_RING(tesla, 0x0d90, 1);
+		OUT_RINGf ((float)(clearValue >> 8) * (1.0 / 16777215.0));
+		BEGIN_RING(tesla, 0x0da0, 1);
+		OUT_RING  (clearValue & 0xff);
+		BEGIN_RING(tesla, 0x19d0, 1);
+		OUT_RING  (0x03);
+		break;
+	default:
+		pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
+				   clearValue);
+		break;
+	}
+
+	pipe->set_framebuffer_state(pipe, &s_fb);
+	pipe->set_scissor_state(pipe, &s_sc);
+	nv50->dirty |= dirty;
+
+	ps->status = PIPE_SURFACE_STATUS_CLEAR;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
new file mode 100644
index 00000000000..b02c53f2095
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv50_context.h"
+#include "nv50_screen.h"
+
+static void
+nv50_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	
+	FIRE_RING(fence);
+}
+
+static void
+nv50_destroy(struct pipe_context *pipe)
+{
+	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+
+	draw_destroy(nv50->draw);
+	FREE(nv50);
+}
+
+
+static void
+nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+}
+
+struct pipe_context *
+nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{
+	struct pipe_winsys *pipe_winsys = pscreen->winsys;
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nv50_context *nv50;
+
+	nv50 = CALLOC_STRUCT(nv50_context);
+	if (!nv50)
+		return NULL;
+	nv50->screen = screen;
+	nv50->pctx_id = pctx_id;
+
+	nv50->pipe.winsys = pipe_winsys;
+	nv50->pipe.screen = pscreen;
+
+	nv50->pipe.destroy = nv50_destroy;
+
+	nv50->pipe.set_edgeflags = nv50_set_edgeflags;
+	nv50->pipe.draw_arrays = nv50_draw_arrays;
+	nv50->pipe.draw_elements = nv50_draw_elements;
+	nv50->pipe.clear = nv50_clear;
+
+	nv50->pipe.flush = nv50_flush;
+
+	nv50_init_surface_functions(nv50);
+	nv50_init_state_functions(nv50);
+	nv50_init_query_functions(nv50);
+
+	nv50->draw = draw_create();
+	assert(nv50->draw);
+	draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50));
+
+	return &nv50->pipe;
+}
+
+		
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
new file mode 100644
index 00000000000..5d377f2d067
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -0,0 +1,185 @@
+#ifndef __NV50_CONTEXT_H__
+#define __NV50_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv50_screen *ctx = nv50->screen
+#include "nouveau/nouveau_push.h"
+
+#include "nv50_screen.h"
+#include "nv50_program.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+/* Constant buffer assignment */
+#define NV50_CB_PMISC		0
+#define NV50_CB_PVP		1
+#define NV50_CB_PFP		2
+#define NV50_CB_PGP		3
+#define NV50_CB_TIC		4
+#define NV50_CB_TSC		5
+#define NV50_CB_PUPLOAD         6
+
+#define NV50_NEW_BLEND		(1 << 0)
+#define NV50_NEW_ZSA		(1 << 1)
+#define NV50_NEW_BLEND_COLOUR	(1 << 2)
+#define NV50_NEW_STIPPLE	(1 << 3)
+#define NV50_NEW_SCISSOR	(1 << 4)
+#define NV50_NEW_VIEWPORT	(1 << 5)
+#define NV50_NEW_RASTERIZER	(1 << 6)
+#define NV50_NEW_FRAMEBUFFER	(1 << 7)
+#define NV50_NEW_VERTPROG	(1 << 8)
+#define NV50_NEW_VERTPROG_CB	(1 << 9)
+#define NV50_NEW_FRAGPROG	(1 << 10)
+#define NV50_NEW_FRAGPROG_CB	(1 << 11)
+#define NV50_NEW_ARRAYS		(1 << 12)
+#define NV50_NEW_SAMPLER	(1 << 13)
+#define NV50_NEW_TEXTURE	(1 << 14)
+
+struct nv50_blend_stateobj {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv50_zsa_stateobj {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv50_rasterizer_stateobj {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv50_miptree {
+	struct pipe_texture base;
+	struct pipe_buffer *buffer;
+};
+
+static INLINE struct nv50_miptree *
+nv50_miptree(struct pipe_texture *pt)
+{
+	return (struct nv50_miptree *)pt;
+}
+
+struct nv50_surface {
+	struct pipe_surface base;
+	struct pipe_buffer *untiled;
+};
+
+static INLINE struct nv50_surface *
+nv50_surface(struct pipe_surface *pt)
+{
+	return (struct nv50_surface *)pt;
+}
+
+struct nv50_state {
+	unsigned dirty;
+
+	struct nouveau_stateobj *fb;
+	struct nouveau_stateobj *blend;
+	struct nouveau_stateobj *blend_colour;
+	struct nouveau_stateobj *zsa;
+	struct nouveau_stateobj *rast;
+	struct nouveau_stateobj *stipple;
+	struct nouveau_stateobj *scissor;
+	unsigned scissor_enabled;
+	struct nouveau_stateobj *viewport;
+	unsigned viewport_bypass;
+	struct nouveau_stateobj *tsc_upload;
+	struct nouveau_stateobj *tic_upload;
+	struct nouveau_stateobj *vertprog;
+	struct nouveau_stateobj *fragprog;
+	struct nouveau_stateobj *vtxfmt;
+	struct nouveau_stateobj *vtxbuf;
+};
+
+struct nv50_context {
+	struct pipe_context pipe;
+
+	struct nv50_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	struct nv50_state state;
+
+	unsigned dirty;
+	struct nv50_blend_stateobj *blend;
+	struct nv50_zsa_stateobj *zsa;
+	struct nv50_rasterizer_stateobj *rasterizer;
+	struct pipe_blend_color blend_colour;
+	struct pipe_poly_stipple stipple;
+	struct pipe_scissor_state scissor;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct nv50_program *vertprog;
+	struct nv50_program *fragprog;
+	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+	unsigned vtxelt_nr;
+	unsigned *sampler[PIPE_MAX_SAMPLERS];
+	unsigned sampler_nr;
+	struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS];
+	unsigned miptree_nr;
+};
+
+static INLINE struct nv50_context *
+nv50_context(struct pipe_context *pipe)
+{
+	return (struct nv50_context *)pipe;
+}
+
+extern void nv50_init_surface_functions(struct nv50_context *nv50);
+extern void nv50_init_state_functions(struct nv50_context *nv50);
+extern void nv50_init_query_functions(struct nv50_context *nv50);
+
+extern void nv50_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv50_draw.c */
+extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
+
+/* nv50_vbo.c */
+extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv50_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+extern void nv50_vbo_validate(struct nv50_context *nv50);
+
+/* nv50_clear.c */
+extern void nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+/* nv50_program.c */
+extern void nv50_vertprog_validate(struct nv50_context *nv50);
+extern void nv50_fragprog_validate(struct nv50_context *nv50);
+extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p);
+
+/* nv50_state_validate.c */
+extern boolean nv50_state_validate(struct nv50_context *nv50);
+
+/* nv50_tex.c */
+extern void nv50_tex_validate(struct nv50_context *);
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_draw.c b/src/gallium/drivers/nv50/nv50_draw.c
new file mode 100644
index 00000000000..2f6f607261e
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_draw.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "draw/draw_pipe.h"
+
+#include "nv50_context.h"
+
+struct nv50_render_stage {
+	struct draw_stage stage;
+	struct nv50_context *nv50;
+};
+
+static INLINE struct nv50_render_stage *
+nv50_render_stage(struct draw_stage *stage)
+{
+	return (struct nv50_render_stage *)stage;
+}
+
+static void
+nv50_render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv50_render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv50_render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv50_render_flush(struct draw_stage *stage, unsigned flags)
+{
+}
+
+static void
+nv50_render_reset_stipple_counter(struct draw_stage *stage)
+{
+	NOUVEAU_ERR("\n");
+}
+
+static void
+nv50_render_destroy(struct draw_stage *stage)
+{
+	FREE(stage);
+}
+
+struct draw_stage *
+nv50_draw_render_stage(struct nv50_context *nv50)
+{
+	struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage);
+
+	rs->nv50 = nv50;
+	rs->stage.draw = nv50->draw;
+	rs->stage.destroy = nv50_render_destroy;
+	rs->stage.point = nv50_render_point;
+	rs->stage.line = nv50_render_line;
+	rs->stage.tri = nv50_render_tri;
+	rs->stage.flush = nv50_render_flush;
+	rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter;
+
+	return &rs->stage;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
new file mode 100644
index 00000000000..28a8bdc0fab
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv50_context.h"
+
+static struct pipe_texture *
+nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
+	unsigned usage, pitch;
+
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = pscreen;
+
+	usage = PIPE_BUFFER_USAGE_PIXEL;
+	switch (pt->format) {
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+		usage |= NOUVEAU_BUFFER_USAGE_ZETA;
+		break;
+	default:
+		break;
+	}
+
+	pitch = ((pt->width[0] + 63) & ~63) * pt->block.size;
+	/*XXX*/
+	pitch *= 2;
+
+	mt->buffer = ws->buffer_create(ws, 256, usage, pitch * pt->height[0]);
+	if (!mt->buffer) {
+		FREE(mt);
+		return NULL;
+	}
+
+	return &mt->base;
+}
+
+static void
+nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt)
+{
+	struct pipe_texture *pt = *ppt;
+
+	*ppt = NULL;
+
+	if (--pt->refcount <= 0) {
+		struct nv50_miptree *mt = nv50_miptree(pt);
+
+		pipe_buffer_reference(pscreen, &mt->buffer, NULL);
+		FREE(mt);
+	}
+}
+
+static struct pipe_surface *
+nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv50_miptree *mt = nv50_miptree(pt);
+	struct nv50_surface *s;
+	struct pipe_surface *ps;
+
+	s = CALLOC_STRUCT(nv50_surface);
+	if (!s)
+		return NULL;
+	ps = &s->base;
+
+	ps->refcount = 1;
+	ps->winsys = pscreen->winsys;
+	ps->format = pt->format;
+	ps->width = pt->width[level];
+	ps->height = pt->height[level];
+	ps->block = pt->block;
+	ps->nblocksx = pt->nblocksx[level];
+	ps->nblocksy = pt->nblocksy[level];
+	ps->stride = ps->width * ps->block.size;
+	ps->offset = 0;
+	ps->usage = flags;
+	ps->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	pipe_texture_reference(&ps->texture, pt);
+	pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer);
+
+	return ps;
+}
+
+static void
+nv50_miptree_surface_del(struct pipe_screen *pscreen,
+			 struct pipe_surface **psurface)
+{
+	struct pipe_surface *ps = *psurface;
+	struct nv50_surface *s = nv50_surface(ps);
+
+	*psurface = NULL;
+
+	if (--ps->refcount <= 0) {
+		pipe_texture_reference(&ps->texture, NULL);
+		pipe_buffer_reference(pscreen, &ps->buffer, NULL);
+		FREE(s);
+	}
+}
+
+void
+nv50_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	pscreen->texture_create = nv50_miptree_create;
+	pscreen->texture_release = nv50_miptree_release;
+	pscreen->get_tex_surface = nv50_miptree_surface_new;
+	pscreen->tex_surface_release = nv50_miptree_surface_del;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
new file mode 100644
index 00000000000..d6fbdd18243
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -0,0 +1,1708 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv50_context.h"
+
+#define NV50_SU_MAX_TEMP 64
+#define NV50_PROGRAM_DUMP
+
+/* ARL - gallium craps itself on progs/vp/arl.txt
+ *
+ * MSB - Like MAD, but MUL+SUB
+ * 	- Fuck it off, introduce a way to negate args for ops that
+ * 	  support it.
+ *
+ * Look into inlining IMMD for ops other than MOV (make it general?)
+ * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
+ * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
+ *
+ * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
+ * case, if the emit_src() causes the inst to suddenly become long.
+ *
+ * Verify half-insns work where expected - and force disable them where they
+ * don't work - MUL has it forcibly disabled atm as it fixes POW..
+ *
+ * FUCK! watch dst==src vectors, can overwrite components that are needed.
+ * 	ie. SUB R0, R0.yzxw, R0
+ *
+ * Things to check with renouveau:
+ * 	FP attr/result assignment - how?
+ * 		attrib
+ * 			- 0x16bc maps vp output onto fp hpos
+ * 			- 0x16c0 maps vp output onto fp col0
+ * 		result
+ * 			- colr always 0-3
+ * 			- depr always 4
+ * 0x16bc->0x16e8 --> some binding between vp/fp regs
+ * 0x16b8 --> VP output count
+ *
+ * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
+ * 	      "MOV rcol.x, fcol.y" = 0x00000004
+ * 0x19a8 --> as above but 0x00000100 and 0x00000000
+ * 	- 0x00100000 used when KIL used
+ * 0x196c --> as above but 0x00000011 and 0x00000000
+ *
+ * 0x1988 --> 0xXXNNNNNN
+ * 	- XX == FP high something
+ */
+struct nv50_reg {
+	enum {
+		P_TEMP,
+		P_ATTR,
+		P_RESULT,
+		P_CONST,
+		P_IMMD
+	} type;
+	int index;
+
+	int hw;
+	int neg;
+};
+
+struct nv50_pc {
+	struct nv50_program *p;
+
+	/* hw resources */
+	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
+
+	/* tgsi resources */
+	struct nv50_reg *temp;
+	int temp_nr;
+	struct nv50_reg *attr;
+	int attr_nr;
+	struct nv50_reg *result;
+	int result_nr;
+	struct nv50_reg *param;
+	int param_nr;
+	struct nv50_reg *immd;
+	float *immd_buf;
+	int immd_nr;
+
+	struct nv50_reg *temp_temp[16];
+	unsigned temp_temp_nr;
+};
+
+static void
+alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	int i;
+
+	if (reg->type == P_RESULT) {
+		if (pc->p->cfg.high_result < (reg->hw + 1))
+			pc->p->cfg.high_result = reg->hw + 1;
+	}
+
+	if (reg->type != P_TEMP)
+		return;
+
+	if (reg->hw >= 0) {
+		/*XXX: do this here too to catch FP temp-as-attr usage..
+		 *     not clean, but works */
+		if (pc->p->cfg.high_temp < (reg->hw + 1))
+			pc->p->cfg.high_temp = reg->hw + 1;
+		return;
+	}
+
+	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+		if (!(pc->r_temp[i])) {
+			pc->r_temp[i] = reg;
+			reg->hw = i;
+			if (pc->p->cfg.high_temp < (i + 1))
+				pc->p->cfg.high_temp = i + 1;
+			return;
+		}
+	}
+
+	assert(0);
+}
+
+static struct nv50_reg *
+alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
+{
+	struct nv50_reg *r;
+	int i;
+
+	if (dst && dst->type == P_TEMP && dst->hw == -1)
+		return dst;
+
+	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
+		if (!pc->r_temp[i]) {
+			r = CALLOC_STRUCT(nv50_reg);
+			r->type = P_TEMP;
+			r->index = -1;
+			r->hw = i;
+			pc->r_temp[i] = r;
+			return r;
+		}
+	}
+
+	assert(0);
+	return NULL;
+}
+
+static void
+free_temp(struct nv50_pc *pc, struct nv50_reg *r)
+{
+	if (r->index == -1) {
+		unsigned hw = r->hw;
+
+		FREE(pc->r_temp[hw]);
+		pc->r_temp[hw] = NULL;
+	}
+}
+
+static struct nv50_reg *
+temp_temp(struct nv50_pc *pc)
+{
+	if (pc->temp_temp_nr >= 16)
+		assert(0);
+
+	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
+	return pc->temp_temp[pc->temp_temp_nr++];
+}
+
+static void
+kill_temp_temp(struct nv50_pc *pc)
+{
+	int i;
+	
+	for (i = 0; i < pc->temp_temp_nr; i++)
+		free_temp(pc, pc->temp_temp[i]);
+	pc->temp_temp_nr = 0;
+}
+
+static int
+ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
+{
+	pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)),
+			       (pc->immd_nr + 1) * 4 * sizeof(float));
+	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
+	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
+	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
+	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
+	
+	return pc->immd_nr++;
+}
+
+static struct nv50_reg *
+alloc_immd(struct nv50_pc *pc, float f)
+{
+	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	unsigned hw;
+
+	hw = ctor_immd(pc, f, 0, 0, 0) * 4;
+	r->type = P_IMMD;
+	r->hw = hw;
+	r->index = -1;
+	return r;
+}
+
+static struct nv50_program_exec *
+exec(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
+
+	e->param.index = -1;
+	return e;
+}
+
+static void
+emit(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	struct nv50_program *p = pc->p;
+
+	if (p->exec_tail)
+		p->exec_tail->next = e;
+	if (!p->exec_head)
+		p->exec_head = e;
+	p->exec_tail = e;
+	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+}
+
+static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
+
+static boolean
+is_long(struct nv50_program_exec *e)
+{
+	if (e->inst[0] & 1)
+		return TRUE;
+	return FALSE;
+}
+
+static boolean
+is_immd(struct nv50_program_exec *e)
+{
+	if (is_long(e) && (e->inst[1] & 3) == 3)
+		return TRUE;
+	return FALSE;
+}
+
+static INLINE void
+set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
+	 struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
+	e->inst[1] |= (pred << 7) | (idx << 12);
+}
+
+static INLINE void
+set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
+	    struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
+	e->inst[1] |= (idx << 4) | (on << 6);
+}
+
+static INLINE void
+set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
+{
+	if (is_long(e))
+		return;
+
+	e->inst[0] |= 1;
+	set_pred(pc, 0xf, 0, e);
+	set_pred_wr(pc, 0, 0, e);
+}
+
+static INLINE void
+set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
+{
+	if (dst->type == P_RESULT) {
+		set_long(pc, e);
+		e->inst[1] |= 0x00000008;
+	}
+
+	alloc_reg(pc, dst);
+	e->inst[0] |= (dst->hw << 2);
+}
+
+static INLINE void
+set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
+{
+	unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */
+
+	set_long(pc, e);
+	/*XXX: can't be predicated - bits overlap.. catch cases where both
+	 *     are required and avoid them. */
+	set_pred(pc, 0, 0, e);
+	set_pred_wr(pc, 0, 0, e);
+
+	e->inst[1] |= 0x00000002 | 0x00000001;
+	e->inst[0] |= (val & 0x3f) << 16;
+	e->inst[1] |= (val >> 6) << 2;
+}
+
+static void
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,
+	    struct nv50_reg *src, struct nv50_reg *iv)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0x80000000;
+	set_dst(pc, dst, e);
+	alloc_reg(pc, src);
+	e->inst[0] |= (src->hw << 16);
+	if (iv) {
+		e->inst[0] |= (1 << 25);
+		alloc_reg(pc, iv);
+		e->inst[0] |= (iv->hw << 9);
+	}
+
+	emit(pc, e);
+}
+
+static void
+set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
+	 struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+#if 1
+	e->inst[1] |= (1 << 22);
+#else
+	if (src->type == P_IMMD) {
+		e->inst[1] |= (NV50_CB_PMISC << 22);
+	} else {
+		if (pc->p->type == PIPE_SHADER_VERTEX)
+			e->inst[1] |= (NV50_CB_PVP << 22);
+		else
+			e->inst[1] |= (NV50_CB_PFP << 22);
+	}
+#endif
+
+	e->param.index = src->hw;
+	e->param.shift = s;
+	e->param.mask = m << (s % 32);
+}
+
+static void
+emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0x10000000;
+
+	set_dst(pc, dst, e);
+
+	if (0 && dst->type != P_RESULT && src->type == P_IMMD) {
+		set_immd(pc, src, e);
+		/*XXX: 32-bit, but steals part of "half" reg space - need to
+		 *     catch and handle this case if/when we do half-regs
+		 */
+		e->inst[0] |= 0x00008000;
+	} else
+	if (src->type == P_IMMD || src->type == P_CONST) {
+		set_long(pc, e);
+		set_data(pc, src, 0x7f, 9, e);
+		e->inst[1] |= 0x20000000; /* src0 const? */
+	} else {
+		if (src->type == P_ATTR) {
+			set_long(pc, e);
+			e->inst[1] |= 0x00200000;
+		}
+
+		alloc_reg(pc, src);
+		e->inst[0] |= (src->hw << 9);
+	}
+
+	/* We really should support "half" instructions here at some point,
+	 * but I don't feel confident enough about them yet.
+	 */
+	set_long(pc, e);
+	if (is_long(e) && !is_immd(e)) {
+		e->inst[1] |= 0x04000000; /* 32-bit */
+		e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */
+	}
+
+	emit(pc, e);
+}
+
+static boolean
+check_swap_src_0_1(struct nv50_pc *pc,
+		   struct nv50_reg **s0, struct nv50_reg **s1)
+{
+	struct nv50_reg *src0 = *s0, *src1 = *s1;
+
+	if (src0->type == P_CONST) {
+		if (src1->type != P_CONST) {
+			*s0 = src1;
+			*s1 = src0;
+			return TRUE;
+		}
+	} else
+	if (src1->type == P_ATTR) {
+		if (src0->type != P_ATTR) {
+			*s0 = src1;
+			*s1 = src0;
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+static void
+set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	if (src->type == P_ATTR) {
+		set_long(pc, e);
+		e->inst[1] |= 0x00200000;
+	} else
+	if (src->type == P_CONST || src->type == P_IMMD) {
+		struct nv50_reg *temp = temp_temp(pc);
+
+		emit_mov(pc, temp, src);
+		src = temp;
+	}
+
+	alloc_reg(pc, src);
+	e->inst[0] |= (src->hw << 9);
+}
+
+static void
+set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	if (src->type == P_ATTR) {
+		struct nv50_reg *temp = temp_temp(pc);
+
+		emit_mov(pc, temp, src);
+		src = temp;
+	} else
+	if (src->type == P_CONST || src->type == P_IMMD) {
+		assert(!(e->inst[0] & 0x00800000));
+		if (e->inst[0] & 0x01000000) {
+			struct nv50_reg *temp = temp_temp(pc);
+
+			emit_mov(pc, temp, src);
+			src = temp;
+		} else {
+			set_data(pc, src, 0x7f, 16, e);
+			e->inst[0] |= 0x00800000;
+		}
+	}
+
+	alloc_reg(pc, src);
+	e->inst[0] |= (src->hw << 16);
+}
+
+static void
+set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
+{
+	set_long(pc, e);
+
+	if (src->type == P_ATTR) {
+		struct nv50_reg *temp = temp_temp(pc);
+
+		emit_mov(pc, temp, src);
+		src = temp;
+	} else
+	if (src->type == P_CONST || src->type == P_IMMD) {
+		assert(!(e->inst[0] & 0x01000000));
+		if (e->inst[0] & 0x00800000) {
+			struct nv50_reg *temp = temp_temp(pc);
+
+			emit_mov(pc, temp, src);
+			src = temp;
+		} else {
+			set_data(pc, src, 0x7f, 32+14, e);
+			e->inst[0] |= 0x01000000;
+		}
+	}
+
+	alloc_reg(pc, src);
+	e->inst[1] |= (src->hw << 14);
+}
+
+static void
+emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xc0000000;
+	set_long(pc, e);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xb0000000;
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	if (is_long(e))
+		set_src_2(pc, src1, e);
+	else
+		set_src_1(pc, src1, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
+	    struct nv50_reg *src0, struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	set_long(pc, e);
+	e->inst[0] |= 0xb0000000;
+	e->inst[1] |= (sub << 29);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xb0000000;
+
+	set_long(pc, e);
+	if (check_swap_src_0_1(pc, &src0, &src1))
+		e->inst[1] |= 0x04000000;
+	else
+		e->inst[1] |= 0x08000000;
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_2(pc, src1, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xe0000000;
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+	set_src_2(pc, src2, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	 struct nv50_reg *src1, struct nv50_reg *src2)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xe0000000;
+	set_long(pc, e);
+	e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+	set_src_2(pc, src2, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_flop(struct nv50_pc *pc, unsigned sub,
+	  struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0x90000000;
+	if (sub) {
+		set_long(pc, e);
+		e->inst[1] |= (sub << 29);
+	}
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xb0000000;
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+	set_long(pc, e);
+	e->inst[1] |= (6 << 29) | 0x00004000;
+
+	emit(pc, e);
+}
+
+static void
+emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] |= 0xb0000000;
+
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+	set_long(pc, e);
+	e->inst[1] |= (6 << 29);
+
+	emit(pc, e);
+}
+
+static void
+emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+	 struct nv50_reg *src0, struct nv50_reg *src1)
+{
+	struct nv50_program_exec *e = exec(pc);
+	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+	struct nv50_reg *rdst;
+
+	assert(c_op <= 7);
+	if (check_swap_src_0_1(pc, &src0, &src1))
+		c_op = inv_cop[c_op];
+
+	rdst = dst;
+	if (dst->type != P_TEMP)
+		dst = alloc_temp(pc, NULL);
+
+	/* set.u32 */
+	set_long(pc, e);
+	e->inst[0] |= 0xb0000000;
+	e->inst[1] |= (3 << 29);
+	e->inst[1] |= (c_op << 14);
+	/*XXX: breaks things, .u32 by default?
+	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
+	 *     doesn't seem to match what the hw actually does.
+	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+	 */
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+	set_src_1(pc, src1, e);
+	emit(pc, e);
+
+	/* cvt.f32.u32 */
+	e = exec(pc);
+	e->inst[0] = 0xa0000001;
+	e->inst[1] = 0x64014780;
+	set_dst(pc, rdst, e);
+	set_src_0(pc, dst, e);
+	emit(pc, e);
+
+	if (dst != rdst)
+		free_temp(pc, dst);
+}
+
+static void
+emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xa0000000; /* cvt */
+	set_long(pc, e);
+	e->inst[1] |= (6 << 29); /* cvt */
+	e->inst[1] |= 0x08000000; /* integer mode */
+	e->inst[1] |= 0x04000000; /* 32 bit */
+	e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */
+	e->inst[1] |= (1 << 14); /* src .f32 */
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
+	 struct nv50_reg *v, struct nv50_reg *e)
+{
+	struct nv50_reg *temp = alloc_temp(pc, NULL);
+
+	emit_flop(pc, 3, temp, v);
+	emit_mul(pc, temp, temp, e);
+	emit_preex2(pc, temp, temp);
+	emit_flop(pc, 6, dst, temp);
+
+	free_temp(pc, temp);
+}
+
+static void
+emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xa0000000; /* cvt */
+	set_long(pc, e);
+	e->inst[1] |= (6 << 29); /* cvt */
+	e->inst[1] |= 0x04000000; /* 32 bit */
+	e->inst[1] |= (1 << 14); /* src .f32 */
+	e->inst[1] |= ((1 << 6) << 14); /* .abs */
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
+emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+	 struct nv50_reg **src)
+{
+	struct nv50_reg *one = alloc_immd(pc, 1.0);
+	struct nv50_reg *zero = alloc_immd(pc, 0.0);
+	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
+	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
+	struct nv50_reg *tmp[4];
+
+	if (mask & (1 << 0))
+		emit_mov(pc, dst[0], one);
+
+	if (mask & (1 << 3))
+		emit_mov(pc, dst[3], one);
+
+	if (mask & (3 << 1)) {
+		if (mask & (1 << 1))
+			tmp[0] = dst[1];
+		else
+			tmp[0] = temp_temp(pc);
+		emit_minmax(pc, 4, tmp[0], src[0], zero);
+	}
+
+	if (mask & (1 << 2)) {
+		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
+
+		tmp[1] = temp_temp(pc);
+		emit_minmax(pc, 4, tmp[1], src[1], zero);
+
+		tmp[3] = temp_temp(pc);
+		emit_minmax(pc, 4, tmp[3], src[3], neg128);
+		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
+
+		emit_pow(pc, dst[2], tmp[1], tmp[3]);
+		emit_mov(pc, dst[2], zero);
+		set_pred(pc, 3, 0, pc->p->exec_tail);
+	}
+}
+
+static void
+emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	set_long(pc, e);
+	e->inst[0] |= 0xa0000000; /* delta */
+	e->inst[1] |= (7 << 29); /* delta */
+	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
+	e->inst[1] |= (1 << 14); /* src .f32 */
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e);
+
+	emit(pc, e);
+}
+
+static struct nv50_reg *
+tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
+{
+	switch (dst->DstRegister.File) {
+	case TGSI_FILE_TEMPORARY:
+		return &pc->temp[dst->DstRegister.Index * 4 + c];
+	case TGSI_FILE_OUTPUT:
+		return &pc->result[dst->DstRegister.Index * 4 + c];
+	case TGSI_FILE_NULL:
+		return NULL;
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
+static struct nv50_reg *
+tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)
+{
+	struct nv50_reg *r = NULL;
+	struct nv50_reg *temp;
+	unsigned c;
+
+	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
+	switch (c) {
+	case TGSI_EXTSWIZZLE_X:
+	case TGSI_EXTSWIZZLE_Y:
+	case TGSI_EXTSWIZZLE_Z:
+	case TGSI_EXTSWIZZLE_W:
+		switch (src->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			r = &pc->attr[src->SrcRegister.Index * 4 + c];
+			break;
+		case TGSI_FILE_TEMPORARY:
+			r = &pc->temp[src->SrcRegister.Index * 4 + c];
+			break;
+		case TGSI_FILE_CONSTANT:
+			r = &pc->param[src->SrcRegister.Index * 4 + c];
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			r = &pc->immd[src->SrcRegister.Index * 4 + c];
+			break;
+		case TGSI_FILE_SAMPLER:
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		break;
+	case TGSI_EXTSWIZZLE_ZERO:
+		r = alloc_immd(pc, 0.0);
+		break;
+	case TGSI_EXTSWIZZLE_ONE:
+		r = alloc_immd(pc, 1.0);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
+	case TGSI_UTIL_SIGN_KEEP:
+		break;
+	case TGSI_UTIL_SIGN_CLEAR:
+		temp = temp_temp(pc);
+		emit_abs(pc, temp, r);
+		r = temp;
+		break;
+	case TGSI_UTIL_SIGN_TOGGLE:
+		temp = temp_temp(pc);
+		emit_neg(pc, temp, r);
+		r = temp;
+		break;
+	case TGSI_UTIL_SIGN_SET:
+		temp = temp_temp(pc);
+		emit_abs(pc, temp, r);
+		emit_neg(pc, temp, r);
+		r = temp;
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	return r;
+}
+
+static boolean
+nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+{
+	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
+	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+	unsigned mask, sat;
+	int i, c;
+
+	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
+
+	for (c = 0; c < 4; c++) {
+		if (mask & (1 << c))
+			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
+		else
+			dst[c] = NULL;
+	}
+
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		for (c = 0; c < 4; c++)
+			src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]);
+	}
+
+	if (sat) {
+		for (c = 0; c < 4; c++) {
+			rdst[c] = dst[c];
+			dst[c] = temp_temp(pc);
+		}
+	}
+
+	switch (inst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_abs(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_ADD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_add(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_COS:
+		temp = alloc_temp(pc, NULL);
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 5, temp, temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		temp = alloc_temp(pc, NULL);
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_DP4:
+		temp = alloc_temp(pc, NULL);
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+		emit_mad(pc, temp, src[0][3], src[1][3], temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_DPH:
+		temp = alloc_temp(pc, NULL);
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+		emit_add(pc, temp, src[1][3], temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_DST:
+	{
+		struct nv50_reg *one = alloc_immd(pc, 1.0);
+		if (mask & (1 << 0))
+			emit_mov(pc, dst[0], one);
+		if (mask & (1 << 1))
+			emit_mul(pc, dst[1], src[0][1], src[1][1]);
+		if (mask & (1 << 2))
+			emit_mov(pc, dst[2], src[0][2]);
+		if (mask & (1 << 3))
+			emit_mov(pc, dst[3], src[1][3]);
+		FREE(one);
+	}
+		break;
+	case TGSI_OPCODE_EX2:
+		temp = alloc_temp(pc, NULL);
+		emit_preex2(pc, temp, src[0][0]);
+		emit_flop(pc, 6, temp, temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_FLR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flr(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_FRC:
+		temp = alloc_temp(pc, NULL);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flr(pc, temp, src[0][c]);
+			emit_sub(pc, dst[c], src[0][c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_LIT:
+		emit_lit(pc, &dst[0], mask, &src[0][0]);
+		break;
+	case TGSI_OPCODE_LG2:
+		temp = alloc_temp(pc, NULL);
+		emit_flop(pc, 3, temp, src[0][0]);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		break;
+	case TGSI_OPCODE_LRP:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			/*XXX: we can do better than this */
+			temp = alloc_temp(pc, NULL);
+			emit_neg(pc, temp, src[0][c]);
+			emit_mad(pc, temp, temp, src[2][c], src[2][c]);
+			emit_mad(pc, dst[c], src[0][c], src[1][c], temp);
+			free_temp(pc, temp);
+		}
+		break;
+	case TGSI_OPCODE_MAD:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+		}
+		break;
+	case TGSI_OPCODE_MAX:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_MIN:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_MOV:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], src[0][c]);
+		}
+		break;
+	case TGSI_OPCODE_MUL:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mul(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_POW:
+		temp = alloc_temp(pc, NULL);
+		emit_pow(pc, temp, src[0][0], src[1][0]);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_RCP:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flop(pc, 0, dst[c], src[0][0]);
+		}
+		break;
+	case TGSI_OPCODE_RSQ:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_flop(pc, 2, dst[c], src[0][0]);
+		}
+		break;
+	case TGSI_OPCODE_SCS:
+		temp = alloc_temp(pc, NULL);
+		emit_precossin(pc, temp, src[0][0]);
+		if (mask & (1 << 0))
+			emit_flop(pc, 5, dst[0], temp);
+		if (mask & (1 << 1))
+			emit_flop(pc, 4, dst[1], temp);
+		break;
+	case TGSI_OPCODE_SGE:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_SIN:
+		temp = alloc_temp(pc, NULL);
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 4, temp, temp);
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_mov(pc, dst[c], temp);
+		}
+		break;
+	case TGSI_OPCODE_SLT:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_SUB:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_sub(pc, dst[c], src[0][c], src[1][c]);
+		}
+		break;
+	case TGSI_OPCODE_TEX:
+		{
+			struct nv50_reg *t0, *t1, *t2, *t3;
+			struct nv50_program_exec *e;
+
+			t0 = alloc_temp(pc, NULL);
+			t0 = alloc_temp(pc, NULL);
+			t1 = alloc_temp(pc, NULL);
+			t2 = alloc_temp(pc, NULL);
+			t3 = alloc_temp(pc, NULL);
+			emit_mov(pc, t0, src[0][0]);
+			emit_mov(pc, t1, src[0][1]);
+
+			e = exec(pc);
+			e->inst[0] = 0xf6400000;
+			set_long(pc, e);
+			e->inst[1] |= 0x0000c004;
+			set_dst(pc, t0, e);
+			emit(pc, e);
+
+			if (mask & (1 << 0)) emit_mov(pc, dst[0], t0);
+			if (mask & (1 << 1)) emit_mov(pc, dst[1], t1);
+			if (mask & (1 << 2)) emit_mov(pc, dst[2], t2);
+			if (mask & (1 << 3)) emit_mov(pc, dst[3], t3);
+
+			free_temp(pc, t0);
+			free_temp(pc, t1);
+			free_temp(pc, t2);
+			free_temp(pc, t3);
+		}
+		break;
+	case TGSI_OPCODE_XPD:
+		temp = alloc_temp(pc, NULL);
+		if (mask & (1 << 0)) {
+			emit_mul(pc, temp, src[0][2], src[1][1]);
+			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
+		}
+		if (mask & (1 << 1)) {
+			emit_mul(pc, temp, src[0][0], src[1][2]);
+			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
+		}
+		if (mask & (1 << 2)) {
+			emit_mul(pc, temp, src[0][1], src[1][0]);
+			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
+		}
+		free_temp(pc, temp);
+		break;
+	case TGSI_OPCODE_END:
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	if (sat) {
+		for (c = 0; c < 4; c++) {
+			struct nv50_program_exec *e;
+
+			if (!(mask & (1 << c)))
+				continue;
+			e = exec(pc);
+
+			e->inst[0] = 0xa0000000; /* cvt */
+			set_long(pc, e);
+			e->inst[1] |= (6 << 29); /* cvt */
+			e->inst[1] |= 0x04000000; /* 32 bit */
+			e->inst[1] |= (1 << 14); /* src .f32 */
+			e->inst[1] |= ((1 << 5) << 14); /* .sat */
+			set_dst(pc, rdst[c], e);
+			set_src_0(pc, dst[c], e);
+			emit(pc, e);
+		}
+	}
+
+	kill_temp_temp(pc);
+	return TRUE;
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context p;
+	boolean ret = FALSE;
+	unsigned i, c;
+
+	tgsi_parse_init(&p, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch (tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm =
+				&p.FullToken.FullImmediate;
+
+			ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,
+				      imm->u.ImmediateFloat32[1].Float,
+				      imm->u.ImmediateFloat32[2].Float,
+				      imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *d;
+			unsigned last;
+
+			d = &p.FullToken.FullDeclaration;
+			last = d->DeclarationRange.Last;
+
+			switch (d->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (pc->temp_nr < (last + 1))
+					pc->temp_nr = last + 1;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (pc->result_nr < (last + 1))
+					pc->result_nr = last + 1;
+				break;
+			case TGSI_FILE_INPUT:
+				if (pc->attr_nr < (last + 1))
+					pc->attr_nr = last + 1;
+				break;
+			case TGSI_FILE_CONSTANT:
+				if (pc->param_nr < (last + 1))
+					pc->param_nr = last + 1;
+				break;
+			case TGSI_FILE_SAMPLER:
+				break;
+			default:
+				NOUVEAU_ERR("bad decl file %d\n",
+					    d->Declaration.File);
+				goto out_err;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (pc->temp_nr) {
+		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
+		if (!pc->temp)
+			goto out_err;
+
+		for (i = 0; i < pc->temp_nr; i++) {
+			for (c = 0; c < 4; c++) {
+				pc->temp[i*4+c].type = P_TEMP;
+				pc->temp[i*4+c].hw = -1;
+				pc->temp[i*4+c].index = i;
+			}
+		}
+	}
+
+	if (pc->attr_nr) {
+		struct nv50_reg *iv = NULL;
+		int aid = 0;
+
+		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
+		if (!pc->attr)
+			goto out_err;
+
+		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+			iv = alloc_temp(pc, NULL);
+			emit_interp(pc, iv, iv, NULL);
+			emit_flop(pc, 0, iv, iv);
+			aid++;
+		}
+
+		for (i = 0; i < pc->attr_nr; i++) {
+			struct nv50_reg *a = &pc->attr[i*4];
+
+			for (c = 0; c < 4; c++) {
+				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+					struct nv50_reg *at =
+						alloc_temp(pc, NULL);
+					pc->attr[i*4+c].type = at->type;
+					pc->attr[i*4+c].hw = at->hw;
+					pc->attr[i*4+c].index = at->index;
+				} else {
+					pc->p->cfg.vp.attr[aid/32] |=
+						(1 << (aid % 32));
+					pc->attr[i*4+c].type = P_ATTR;
+					pc->attr[i*4+c].hw = aid++;
+					pc->attr[i*4+c].index = i;
+				}
+			}
+
+			if (pc->p->type != PIPE_SHADER_FRAGMENT)
+				continue;
+
+			emit_interp(pc, &a[0], &a[0], iv);
+			emit_interp(pc, &a[1], &a[1], iv);
+			emit_interp(pc, &a[2], &a[2], iv);
+			emit_interp(pc, &a[3], &a[3], iv);
+		}
+
+		if (iv)
+			free_temp(pc, iv);
+	}
+
+	if (pc->result_nr) {
+		int rid = 0;
+
+		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
+		if (!pc->result)
+			goto out_err;
+
+		for (i = 0; i < pc->result_nr; i++) {
+			for (c = 0; c < 4; c++) {
+				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
+					pc->result[i*4+c].type = P_TEMP;
+					pc->result[i*4+c].hw = -1;
+				} else {
+					pc->result[i*4+c].type = P_RESULT;
+					pc->result[i*4+c].hw = rid++;
+				}
+				pc->result[i*4+c].index = i;
+			}
+		}
+	}
+
+	if (pc->param_nr) {
+		int rid = 0;
+
+		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
+		if (!pc->param)
+			goto out_err;
+
+		for (i = 0; i < pc->param_nr; i++) {
+			for (c = 0; c < 4; c++) {
+				pc->param[i*4+c].type = P_CONST;
+				pc->param[i*4+c].hw = rid++;
+				pc->param[i*4+c].index = i;
+			}
+		}
+	}
+
+	if (pc->immd_nr) {
+		int rid = pc->param_nr * 4;
+
+		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+		if (!pc->immd)
+			goto out_err;
+
+		for (i = 0; i < pc->immd_nr; i++) {
+			for (c = 0; c < 4; c++) {
+				pc->immd[i*4+c].type = P_IMMD;
+				pc->immd[i*4+c].hw = rid++;
+				pc->immd[i*4+c].index = i;
+			}
+		}
+	}
+
+	ret = TRUE;
+out_err:
+	tgsi_parse_free(&p);
+	return ret;
+}
+
+static boolean
+nv50_program_tx(struct nv50_program *p)
+{
+	struct tgsi_parse_context parse;
+	struct nv50_pc *pc;
+	boolean ret;
+
+	pc = CALLOC_STRUCT(nv50_pc);
+	if (!pc)
+		return FALSE;
+	pc->p = p;
+	pc->p->cfg.high_temp = 4;
+
+	ret = nv50_program_tx_prep(pc);
+	if (ret == FALSE)
+		goto out_cleanup;
+
+	tgsi_parse_init(&parse, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		const union tgsi_full_token *tok = &parse.FullToken;
+
+		tgsi_parse_token(&parse);
+
+		switch (tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			ret = nv50_program_tx_insn(pc, tok);
+			if (ret == FALSE)
+				goto out_err;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		struct nv50_reg out;
+
+		out.type = P_TEMP;
+		for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)
+			emit_mov(pc, &out, &pc->result[out.hw]);
+	}
+
+	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
+	pc->p->exec_tail->inst[1] |= 0x00000001;
+
+	p->param_nr = pc->param_nr * 4;
+	p->immd_nr = pc->immd_nr * 4;
+	p->immd = pc->immd_buf;
+
+out_err:
+	tgsi_parse_free(&parse);
+
+out_cleanup:
+	return ret;
+}
+
+static void
+nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
+{
+	if (nv50_program_tx(p) == FALSE)
+		assert(0);
+	p->translated = TRUE;
+}
+
+static void
+nv50_program_upload_data(struct nv50_context *nv50, float *map,
+			 unsigned start, unsigned count)
+{
+	while (count) {
+		unsigned nr = count > 2047 ? 2047 : count;
+
+		BEGIN_RING(tesla, 0x00000f00, 1);
+		OUT_RING  ((NV50_CB_PMISC << 0) | (start << 8));
+		BEGIN_RING(tesla, 0x40000f04, nr);
+		OUT_RINGp (map, nr);
+
+		map += nr;
+		start += nr;
+		count -= nr;
+	}
+}
+
+static void
+nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
+{
+	struct nouveau_winsys *nvws = nv50->screen->nvws;
+	struct pipe_winsys *ws = nv50->pipe.winsys;
+	unsigned nr = p->param_nr + p->immd_nr;
+
+	if (!p->data && nr) {
+		struct nouveau_resource *heap = nv50->screen->vp_data_heap;
+
+		if (nvws->res_alloc(heap, nr, p, &p->data)) {
+			while (heap->next && heap->size < nr) {
+				struct nv50_program *evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, nr, p, &p->data))
+				assert(0);
+		}
+	}
+
+	if (p->param_nr) {
+		float *map = ws->buffer_map(ws, nv50->constbuf[p->type],
+					    PIPE_BUFFER_USAGE_CPU_READ);
+		nv50_program_upload_data(nv50, map, p->data->start,
+					 p->param_nr);
+		ws->buffer_unmap(ws, nv50->constbuf[p->type]);
+	}
+
+	if (p->immd_nr) {
+		nv50_program_upload_data(nv50, p->immd,
+					 p->data->start + p->param_nr,
+					 p->immd_nr);
+	}
+}
+
+static void
+nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
+{
+	struct pipe_winsys *ws = nv50->pipe.winsys;
+	struct nv50_program_exec *e;
+	struct nouveau_stateobj *so;
+	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+	unsigned start, count, *up, *ptr;
+	boolean upload = FALSE;
+
+	if (!p->buffer) {
+		p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4);
+		upload = TRUE;
+	}
+
+	if (p->data && p->data->start != p->data_start) {
+		for (e = p->exec_head; e; e = e->next) {
+			unsigned ei, ci;
+
+			if (e->param.index < 0)
+				continue;
+			ei = e->param.shift >> 5;
+			ci = e->param.index + p->data->start;
+
+			e->inst[ei] &= ~e->param.mask;
+			e->inst[ei] |= (ci << e->param.shift);
+		}
+
+		p->data_start = p->data->start;
+		upload = TRUE;
+	}
+
+	if (!upload)
+		return;
+
+	up = ptr = MALLOC(p->exec_size * 4);
+	for (e = p->exec_head; e; e = e->next) {
+		*(ptr++) = e->inst[0];
+		if (is_long(e))
+			*(ptr++) = e->inst[1];
+	}
+
+	so = so_new(4,2);
+	so_method(so, nv50->screen->tesla, 0x1280, 3);
+	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
+
+	start = 0; count = p->exec_size;
+	while (count) {
+		struct nouveau_winsys *nvws = nv50->screen->nvws;
+		unsigned nr;
+
+		so_emit(nvws, so);
+
+		nr = MIN2(count, 2047);
+		nr = MIN2(nvws->channel->pushbuf->remaining, nr);
+		if (nvws->channel->pushbuf->remaining < (nr + 3)) {
+			FIRE_RING(NULL);
+			continue;
+		}
+
+		BEGIN_RING(tesla, 0x0f00, 1);
+		OUT_RING  ((start << 8) | NV50_CB_PUPLOAD);
+		BEGIN_RING(tesla, 0x40000f04, nr);	
+		OUT_RINGp (up + start, nr);
+
+		start += nr;
+		count -= nr;
+	}
+
+	FREE(up);
+	so_ref(NULL, &so);
+}
+
+void
+nv50_vertprog_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *p = nv50->vertprog;
+	struct nouveau_stateobj *so;
+
+	if (!p->translated) {
+		nv50_program_validate(nv50, p);
+		if (!p->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, p);
+	nv50_program_validate_code(nv50, p);
+
+	so = so_new(13, 2);
+	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
+	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, tesla, 0x1650, 2);
+	so_data  (so, p->cfg.vp.attr[0]);
+	so_data  (so, p->cfg.vp.attr[1]);
+	so_method(so, tesla, 0x16b8, 1);
+	so_data  (so, p->cfg.high_result);
+	so_method(so, tesla, 0x16ac, 2);
+	so_data  (so, p->cfg.high_result); //8);
+	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, 0x140c, 1);
+	so_data  (so, 0); /* program start offset */
+	so_ref(so, &nv50->state.vertprog);
+}
+
+void
+nv50_fragprog_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *p = nv50->fragprog;
+	struct nouveau_stateobj *so;
+
+	if (!p->translated) {
+		nv50_program_validate(nv50, p);
+		if (!p->translated)
+			assert(0);
+	}
+
+	nv50_program_validate_data(nv50, p);
+	nv50_program_validate_code(nv50, p);
+
+	so = so_new(64, 2);
+	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
+	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+		  NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, tesla, 0x1904, 4);
+	so_data  (so, 0x01040404); /* p: 0x01000404 */
+	so_data  (so, 0x00000004);
+	so_data  (so, 0x00000000);
+	so_data  (so, 0x00000000);
+	so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */
+	so_data  (so, 0x03020100);
+	so_data  (so, 0x07060504);
+	so_data  (so, 0x0b0a0908);
+	so_method(so, tesla, 0x1988, 2);
+	so_data  (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */
+	so_data  (so, p->cfg.high_temp);
+	so_method(so, tesla, 0x1414, 1);
+	so_data  (so, 0); /* program start offset */
+	so_ref(so, &nv50->state.fragprog);
+}
+
+void
+nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
+{
+	struct pipe_winsys *ws = nv50->pipe.winsys;
+
+	while (p->exec_head) {
+		struct nv50_program_exec *e = p->exec_head;
+
+		p->exec_head = e->next;
+		FREE(e);
+	}
+	p->exec_tail = NULL;
+	p->exec_size = 0;
+
+	if (p->buffer)
+		pipe_buffer_reference(ws, &p->buffer, NULL);
+
+	nv50->screen->nvws->res_free(&p->data);
+
+	p->translated = 0;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
new file mode 100644
index 00000000000..78deed6a384
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -0,0 +1,45 @@
+#ifndef __NV50_PROGRAM_H__
+#define __NV50_PROGRAM_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nv50_program_exec {
+	struct nv50_program_exec *next;
+
+	unsigned inst[2];
+	struct {
+		int index;
+		unsigned mask;
+		unsigned shift;
+	} param;
+};
+
+struct nv50_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+	boolean translated;
+
+	unsigned type;
+	struct nv50_program_exec *exec_head;
+	struct nv50_program_exec *exec_tail;
+	unsigned exec_size;
+	struct nouveau_resource *data;
+	unsigned data_start;
+
+	struct pipe_buffer *buffer;
+
+	float *immd;
+	unsigned immd_nr;
+	unsigned param_nr;
+
+	struct {
+		unsigned high_temp;
+		unsigned high_result;
+		struct {
+			unsigned attr[2];
+		} vp;
+	} cfg;
+};
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
new file mode 100644
index 00000000000..26bd90ccc5e
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+
+#include "nv50_context.h"
+
+static struct pipe_query *
+nv50_query_create(struct pipe_context *pipe, unsigned type)
+{
+	NOUVEAU_ERR("unimplemented\n");
+	return NULL;
+}
+
+static void
+nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
+{
+	NOUVEAU_ERR("unimplemented\n");
+}
+
+static void
+nv50_query_begin(struct pipe_context *pipe, struct pipe_query *q)
+{
+	NOUVEAU_ERR("unimplemented\n");
+}
+
+static void
+nv50_query_end(struct pipe_context *pipe, struct pipe_query *q)
+{
+	NOUVEAU_ERR("unimplemented\n");
+}
+
+static boolean
+nv50_query_result(struct pipe_context *pipe, struct pipe_query *q,
+		  boolean wait, uint64 *result)
+{
+	NOUVEAU_ERR("unimplemented\n");
+	*result = 0xdeadcafe;
+	return TRUE;
+}
+
+void
+nv50_init_query_functions(struct nv50_context *nv50)
+{
+	nv50->pipe.create_query = nv50_query_create;
+	nv50->pipe.destroy_query = nv50_query_destroy;
+	nv50->pipe.begin_query = nv50_query_begin;
+	nv50->pipe.end_query = nv50_query_end;
+	nv50->pipe.get_query_result = nv50_query_result;
+}
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
new file mode 100644
index 00000000000..b5aef7dadd8
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_screen.h"
+
+#include "nv50_context.h"
+#include "nv50_screen.h"
+
+#include "nouveau/nouveau_stateobj.h"
+
+#define NV5X_GRCLASS5097_CHIPSETS 0x00000001
+#define NV8X_GRCLASS8297_CHIPSETS 0x00000050
+#define NV9X_GRCLASS8297_CHIPSETS 0x00000014
+
+static boolean
+nv50_screen_is_format_supported(struct pipe_screen *pscreen,
+				enum pipe_format format,
+				enum pipe_texture_target target,
+				unsigned tex_usage, unsigned geom_flags)
+{
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM:
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z16_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_A1R5G5B5_UNORM:
+		case PIPE_FORMAT_A4R4G4B4_UNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+		case PIPE_FORMAT_A8L8_UNORM:
+			return TRUE;
+		default:
+			break;
+		}
+	}
+
+	return FALSE;
+}
+
+static const char *
+nv50_screen_get_name(struct pipe_screen *pscreen)
+{
+	struct nv50_screen *screen = nv50_screen(pscreen);
+	struct nouveau_device *dev = screen->nvws->channel->device;
+	static char buffer[128];
+
+	snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+	return buffer;
+}
+
+static const char *
+nv50_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+static int
+nv50_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 32;
+	case PIPE_CAP_NPOT_TEXTURES:
+		return 0;
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+		return 1;
+	case PIPE_CAP_GLSL:
+		return 0;
+	case PIPE_CAP_S3TC:
+		return 0;
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+		return 0;
+	case PIPE_CAP_POINT_SPRITE:
+		return 0;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 8;
+	case PIPE_CAP_OCCLUSION_QUERY:
+		return 0;
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+		return 1;
+	case NOUVEAU_CAP_HW_VTXBUF:	
+		return 1;
+	case NOUVEAU_CAP_HW_IDXBUF:	
+		return 0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+static float
+nv50_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	case PIPE_CAP_MAX_LINE_WIDTH:
+	case PIPE_CAP_MAX_LINE_WIDTH_AA:
+		return 10.0;
+	case PIPE_CAP_MAX_POINT_WIDTH:
+	case PIPE_CAP_MAX_POINT_WIDTH_AA:
+		return 64.0;
+	case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+		return 16.0;
+	case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+		return 4.0;
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0.0;
+	}
+}
+
+static void
+nv50_screen_destroy(struct pipe_screen *pscreen)
+{
+	FREE(pscreen);
+}
+
+struct pipe_screen *
+nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen);
+	struct nouveau_stateobj *so;
+	unsigned tesla_class = 0, ret;
+	unsigned chipset = nvws->channel->device->chipset;
+	int i;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	if ((chipset & 0xf0) != 0x50 && (chipset & 0xf0) != 0x80) {
+		NOUVEAU_ERR("Not a G8x chipset\n");
+		nv50_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	switch (chipset & 0xf0) {
+	case 0x50:
+		if (NV5X_GRCLASS5097_CHIPSETS & (1 << (chipset & 0x0f)))
+			tesla_class = 0x5097;
+		break;
+	case 0x80:
+		if (NV8X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f)))
+			tesla_class = 0x8297;
+		break;
+	case 0x90:
+		if (NV9X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f)))
+			tesla_class = 0x8297;
+		break;
+	default:
+		break;
+	}
+
+	if (tesla_class == 0) {
+		NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset);
+		nv50_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		nv50_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Sync notifier */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv50_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Static tesla init */
+	so = so_new(256, 20);
+
+	so_method(so, screen->tesla, 0x1558, 1);
+	so_data  (so, 1);
+	so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1);
+	so_data  (so, screen->sync->handle);
+	so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0),
+				     NV50TCL_DMA_UNK0__SIZE);
+	for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++)
+		so_data(so, nvws->channel->vram->handle);
+	so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0),
+				     NV50TCL_DMA_UNK1__SIZE);
+	for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++)
+		so_data(so, nvws->channel->vram->handle);
+	so_method(so, screen->tesla, 0x121c, 1);
+	so_data  (so, 1);
+
+	so_method(so, screen->tesla, 0x13bc, 1);
+	so_data  (so, 0x54);
+	so_method(so, screen->tesla, 0x13ac, 1);
+	so_data  (so, 1);
+	so_method(so, screen->tesla, 0x16b8, 1);
+	so_data  (so, 8);
+
+	/* Shared constant buffer */
+	screen->constbuf = ws->buffer_create(ws, 0, 0, 128 * 4 * 4);
+	if (nvws->res_init(&screen->vp_data_heap, 0, 128)) {
+		NOUVEAU_ERR("Error initialising constant buffer\n");
+		nv50_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_PMISC << 16) | 0x00001000);
+
+	/* Texture sampler/image unit setup - we abuse the constant buffer
+	 * upload mechanism for the moment to upload data to the tex config
+	 * blocks.  At some point we *may* want to go the NVIDIA way of doing
+	 * things?
+	 */
+	screen->tic = ws->buffer_create(ws, 0, 0, 32 * 8 * 4);
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_TIC << 16) | 0x0800);
+	so_method(so, screen->tesla, 0x1574, 3);
+	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, 0x00000800);
+
+	screen->tsc = ws->buffer_create(ws, 0, 0, 32 * 8 * 4);
+	so_method(so, screen->tesla, 0x1280, 3);
+	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, (NV50_CB_TSC << 16) | 0x0800);
+	so_method(so, screen->tesla, 0x155c, 3);
+	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	so_data  (so, 0x00000800);
+
+
+	/* Vertex array limits - max them out */
+	for (i = 0; i < 16; i++) {
+		so_method(so, screen->tesla, 0x1080 + (i * 8), 2);
+		so_data  (so, 0x000000ff);
+		so_data  (so, 0xffffffff);
+	}
+
+	so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2);
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(1.0));
+
+	so_method(so, screen->tesla, 0x1234, 1);
+	so_data  (so, 1);
+	so_method(so, screen->tesla, 0x1458, 1);
+	so_data  (so, 1);
+
+	so_emit(nvws, so);
+	so_ref(so, &screen->static_init);
+	nvws->push_flush(nvws, 0, NULL);
+
+	screen->pipe.winsys = ws;
+
+	screen->pipe.destroy = nv50_screen_destroy;
+
+	screen->pipe.get_name = nv50_screen_get_name;
+	screen->pipe.get_vendor = nv50_screen_get_vendor;
+	screen->pipe.get_param = nv50_screen_get_param;
+	screen->pipe.get_paramf = nv50_screen_get_paramf;
+
+	screen->pipe.is_format_supported = nv50_screen_is_format_supported;
+
+	nv50_screen_init_miptree_functions(&screen->pipe);
+	nv50_surface_init_screen_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
new file mode 100644
index 00000000000..400ddcef06d
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -0,0 +1,33 @@
+#ifndef __NV50_SCREEN_H__
+#define __NV50_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv50_screen {
+	struct pipe_screen pipe;
+
+	struct nouveau_winsys *nvws;
+
+	unsigned cur_pctx;
+
+	struct nouveau_grobj *tesla;
+	struct nouveau_notifier *sync;
+
+	struct pipe_buffer *constbuf;
+	struct nouveau_resource *vp_data_heap;
+
+	struct pipe_buffer *tic;
+	struct pipe_buffer *tsc;
+
+	struct nouveau_stateobj *static_init;
+};
+
+static INLINE struct nv50_screen *
+nv50_screen(struct pipe_screen *screen)
+{
+	return (struct nv50_screen *)screen;
+}
+
+void nv50_surface_init_screen_functions(struct pipe_screen *);
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
new file mode 100644
index 00000000000..95f9d408b5e
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -0,0 +1,638 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv50_context.h"
+#include "nv50_texture.h"
+
+#include "nouveau/nouveau_stateobj.h"
+
+static void *
+nv50_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj);
+	unsigned cmask = 0, i;
+
+	/*XXX ignored:
+	 * 	- dither
+	 */
+
+	if (cso->blend_enable == 0) {
+		so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8);
+		for (i = 0; i < 8; i++)
+			so_data(so, 0);
+	} else {
+		so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8);
+		for (i = 0; i < 8; i++)
+			so_data(so, 1);
+		so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5);
+		so_data  (so, nvgl_blend_eqn(cso->rgb_func));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rgb_src_factor));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->rgb_dst_factor));
+		so_data  (so, nvgl_blend_eqn(cso->alpha_func));
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->alpha_src_factor));
+		so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1);
+		so_data  (so, 0x4000 | nvgl_blend_func(cso->alpha_dst_factor));
+	}
+
+	if (cso->logicop_enable == 0 ) {
+		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	} else {
+		so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	}
+
+	if (cso->colormask & PIPE_MASK_R)
+		cmask |= (1 << 0);
+	if (cso->colormask & PIPE_MASK_G)
+		cmask |= (1 << 4);
+	if (cso->colormask & PIPE_MASK_B)
+		cmask |= (1 << 8);
+	if (cso->colormask & PIPE_MASK_A)
+		cmask |= (1 << 12);
+	so_method(so, tesla, NV50TCL_COLOR_MASK(0), 8);
+	for (i = 0; i < 8; i++)
+		so_data(so, cmask);
+
+	bso->pipe = *cso;
+	so_ref(so, &bso->so);
+	return (void *)bso;
+}
+
+static void
+nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->blend = hwcso;
+	nv50->dirty |= NV50_NEW_BLEND;
+}
+
+static void
+nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_blend_stateobj *bso = hwcso;
+
+	so_ref(NULL, &bso->so);
+	FREE(bso);
+}
+
+static INLINE unsigned
+wrap_mode(unsigned wrap)
+{
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		return NV50TSC_1_0_WRAPS_REPEAT;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		return NV50TSC_1_0_WRAPS_MIRROR_REPEAT;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		return NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		return NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER;
+	case PIPE_TEX_WRAP_CLAMP:
+		return NV50TSC_1_0_WRAPS_CLAMP;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		return NV50TSC_1_0_WRAPS_MIRROR_CLAMP;
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		return NV50TSC_1_0_WRAPS_REPEAT;
+	}
+}
+static void *
+nv50_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	unsigned *tsc = CALLOC(8, sizeof(unsigned));
+
+	tsc[0] = (0x00024000 |
+		  (wrap_mode(cso->wrap_s) << 0) |
+		  (wrap_mode(cso->wrap_t) << 3) |
+		  (wrap_mode(cso->wrap_r) << 6));
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		tsc[1] |= NV50TSC_1_1_MAGF_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		tsc[1] |= NV50TSC_1_1_MINF_NEAREST;
+		break;
+	}
+
+	switch (cso->min_mip_filter) {
+	case PIPE_TEX_MIPFILTER_LINEAR:
+		tsc[1] |= NV50TSC_1_1_MIPF_LINEAR;
+		break;
+	case PIPE_TEX_MIPFILTER_NEAREST:
+		tsc[1] |= NV50TSC_1_1_MIPF_NEAREST;
+		break;
+	case PIPE_TEX_MIPFILTER_NONE:
+	default:
+		tsc[1] |= NV50TSC_1_1_MIPF_NONE;
+		break;
+	}
+
+	return (void *)tsc;
+}
+
+static void
+nv50_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	int i;
+
+	nv50->sampler_nr = nr;
+	for (i = 0; i < nv50->sampler_nr; i++)
+		nv50->sampler[i] = sampler[i];
+
+	nv50->dirty |= NV50_NEW_SAMPLER;
+}
+
+static void
+nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);
+}
+
+static void
+nv50_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **pt)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	int i;
+
+	for (i = 0; i < nr; i++)
+		pipe_texture_reference((void *)&nv50->miptree[i], pt[i]);
+	for (i = nr; i < nv50->miptree_nr; i++)
+		pipe_texture_reference((void *)&nv50->miptree[i], NULL);
+
+	nv50->miptree_nr = nr;
+	nv50->dirty |= NV50_NEW_TEXTURE;
+}
+
+static void *
+nv50_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nouveau_stateobj *so = so_new(64, 0);
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nv50_rasterizer_stateobj *rso =
+		CALLOC_STRUCT(nv50_rasterizer_stateobj);
+
+	/*XXX: ignored
+	 * 	- light_twosize
+	 * 	- point_smooth
+	 * 	- multisample
+	 * 	- point_sprite / sprite_coord_mode
+	 */
+
+	so_method(so, tesla, NV50TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT :
+				       NV50TCL_SHADE_MODEL_SMOOTH);
+
+	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
+	so_data  (so, fui(cso->line_width));
+	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	if (cso->line_stipple_enable) {
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_PATTERN, 1);
+		so_data  (so, (cso->line_stipple_pattern << 8) |
+			       cso->line_stipple_factor);
+	} else {
+		so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+
+	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+	} else {
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+
+	so_method(so, tesla, NV50TCL_CULL_FACE_ENABLE, 3);
+	so_data  (so, cso->cull_mode != PIPE_WINDING_NONE);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, NV50TCL_FRONT_FACE_CCW);
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV50TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV50TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV50TCL_CULL_FACE_BACK);
+			break;
+		}
+	} else {
+		so_data(so, NV50TCL_FRONT_FACE_CW);
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV50TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV50TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV50TCL_CULL_FACE_BACK);
+			break;
+		}
+	}
+
+	so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+
+	so_method(so, tesla, NV50TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+
+	if (cso->offset_cw || cso->offset_ccw) {
+		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1);
+		so_data  (so, fui(cso->offset_scale));
+		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1);
+		so_data  (so, fui(cso->offset_units));
+	}
+
+	rso->pipe = *cso;
+	so_ref(so, &rso->so);
+	return (void *)rso;
+}
+
+static void
+nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->rasterizer = hwcso;
+	nv50->dirty |= NV50_NEW_RASTERIZER;
+}
+
+static void
+nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_rasterizer_stateobj *rso = hwcso;
+
+	so_ref(NULL, &rso->so);
+	FREE(rso);
+}
+
+static void *
+nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
+	struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj);
+	struct nouveau_stateobj *so = so_new(64, 0);
+
+	so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1);
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	if (cso->depth.enabled) {
+		so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, NV50TCL_DEPTH_TEST_FUNC, 1);
+		so_data  (so, nvgl_comparison_op(cso->depth.func));
+	} else {
+		so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	/*XXX: yes, I know they're backwards.. header needs fixing */
+	if (cso->stencil[0].enabled) {
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5);
+		so_data  (so, 1);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3);
+		so_data  (so, cso->stencil[0].ref_value);
+		so_data  (so, cso->stencil[0].write_mask);
+		so_data  (so, cso->stencil[0].value_mask);
+	} else {
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	if (cso->stencil[1].enabled) {
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8);
+		so_data  (so, 1);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_data  (so, cso->stencil[1].ref_value);
+		so_data  (so, cso->stencil[1].write_mask);
+		so_data  (so, cso->stencil[1].value_mask);
+	} else {
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	if (cso->alpha.enabled) {
+		so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, NV50TCL_ALPHA_TEST_REF, 2);
+		so_data  (so, fui(cso->alpha.ref));
+		so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	} else {
+		so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	zsa->pipe = *cso;
+	so_ref(so, &zsa->so);
+	return (void *)zsa;
+}
+
+static void
+nv50_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->zsa = hwcso;
+	nv50->dirty |= NV50_NEW_ZSA;
+}
+
+static void
+nv50_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_zsa_stateobj *zsa = hwcso;
+
+	so_ref(NULL, &zsa->so);
+	FREE(zsa);
+}
+
+static void *
+nv50_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *p = CALLOC_STRUCT(nv50_program);
+
+	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	p->type = PIPE_SHADER_VERTEX;
+	tgsi_scan_shader(p->pipe.tokens, &p->info);
+	return (void *)p;
+}
+
+static void
+nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->vertprog = hwcso;
+	nv50->dirty |= NV50_NEW_VERTPROG;
+}
+
+static void
+nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_program *p = hwcso;
+
+	nv50_program_destroy(nv50, p);
+	FREE((void*)p->pipe.tokens);
+	FREE(p);
+}
+
+static void *
+nv50_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv50_program *p = CALLOC_STRUCT(nv50_program);
+
+	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	p->type = PIPE_SHADER_FRAGMENT;
+	tgsi_scan_shader(p->pipe.tokens, &p->info);
+	return (void *)p;
+}
+
+static void
+nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->fragprog = hwcso;
+	nv50->dirty |= NV50_NEW_FRAGPROG;
+}
+
+static void
+nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_program *p = hwcso;
+
+	nv50_program_destroy(nv50, p);
+	FREE((void*)p->pipe.tokens);
+	FREE(p);
+}
+
+static void
+nv50_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->blend_colour = *bcol;
+	nv50->dirty |= NV50_NEW_BLEND_COLOUR;
+}
+
+static void
+nv50_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+}
+
+static void
+nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer;
+		nv50->dirty |= NV50_NEW_VERTPROG_CB;
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer;
+		nv50->dirty |= NV50_NEW_FRAGPROG_CB;
+	}
+}
+
+static void
+nv50_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->framebuffer = *fb;
+	nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+}
+
+static void
+nv50_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->stipple = *stipple;
+	nv50->dirty |= NV50_NEW_STIPPLE;
+}
+
+static void
+nv50_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->scissor = *s;
+	nv50->dirty |= NV50_NEW_SCISSOR;
+}
+
+static void
+nv50_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50->viewport = *vpt;
+	nv50->dirty |= NV50_NEW_VIEWPORT;
+}
+
+static void
+nv50_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	memcpy(nv50->vtxbuf, vb, sizeof(*vb) * count);
+	nv50->vtxbuf_nr = count;
+
+	nv50->dirty |= NV50_NEW_ARRAYS;
+}
+
+static void
+nv50_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	memcpy(nv50->vtxelt, ve, sizeof(*ve) * count);
+	nv50->vtxelt_nr = count;
+
+	nv50->dirty |= NV50_NEW_ARRAYS;
+}
+
+void
+nv50_init_state_functions(struct nv50_context *nv50)
+{
+	nv50->pipe.create_blend_state = nv50_blend_state_create;
+	nv50->pipe.bind_blend_state = nv50_blend_state_bind;
+	nv50->pipe.delete_blend_state = nv50_blend_state_delete;
+
+	nv50->pipe.create_sampler_state = nv50_sampler_state_create;
+	nv50->pipe.bind_sampler_states = nv50_sampler_state_bind;
+	nv50->pipe.delete_sampler_state = nv50_sampler_state_delete;
+	nv50->pipe.set_sampler_textures = nv50_set_sampler_texture;
+
+	nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create;
+	nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind;
+	nv50->pipe.delete_rasterizer_state = nv50_rasterizer_state_delete;
+
+	nv50->pipe.create_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_create;
+	nv50->pipe.bind_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_bind;
+	nv50->pipe.delete_depth_stencil_alpha_state =
+		nv50_depth_stencil_alpha_state_delete;
+
+	nv50->pipe.create_vs_state = nv50_vp_state_create;
+	nv50->pipe.bind_vs_state = nv50_vp_state_bind;
+	nv50->pipe.delete_vs_state = nv50_vp_state_delete;
+
+	nv50->pipe.create_fs_state = nv50_fp_state_create;
+	nv50->pipe.bind_fs_state = nv50_fp_state_bind;
+	nv50->pipe.delete_fs_state = nv50_fp_state_delete;
+
+	nv50->pipe.set_blend_color = nv50_set_blend_color;
+	nv50->pipe.set_clip_state = nv50_set_clip_state;
+	nv50->pipe.set_constant_buffer = nv50_set_constant_buffer;
+	nv50->pipe.set_framebuffer_state = nv50_set_framebuffer_state;
+	nv50->pipe.set_polygon_stipple = nv50_set_polygon_stipple;
+	nv50->pipe.set_scissor_state = nv50_set_scissor_state;
+	nv50->pipe.set_viewport_state = nv50_set_viewport_state;
+
+	nv50->pipe.set_vertex_buffers = nv50_set_vertex_buffers;
+	nv50->pipe.set_vertex_elements = nv50_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
new file mode 100644
index 00000000000..198e25f4482
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50_context.h"
+#include "nouveau/nouveau_stateobj.h"
+
+static void
+nv50_state_validate_fb(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so = so_new(128, 18);
+	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+	unsigned i, w, h, gw = 0;
+
+	for (i = 0; i < fb->num_cbufs; i++) {
+		if (!gw) {
+			w = fb->cbufs[i]->width;
+			h = fb->cbufs[i]->height;
+			gw = 1;
+		} else {
+			assert(w == fb->cbufs[i]->width);
+			assert(h == fb->cbufs[i]->height);
+		}
+
+		so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2);
+		so_data  (so, fb->cbufs[i]->width);
+		so_data  (so, fb->cbufs[i]->height);
+
+		so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5);
+		so_reloc (so, fb->cbufs[i]->buffer, fb->cbufs[i]->offset,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (so, fb->cbufs[i]->buffer, fb->cbufs[i]->offset,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0);
+		switch (fb->cbufs[i]->format) {
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+			so_data(so, 0xcf);
+			break;
+		case PIPE_FORMAT_R5G6B5_UNORM:
+			so_data(so, 0xe8);
+			break;
+		default:
+			NOUVEAU_ERR("AIIII unknown format %s\n",
+				    pf_name(fb->cbufs[i]->format));
+			so_data(so, 0xe6);
+			break;
+		}
+		so_data(so, 0x00000000);
+		so_data(so, 0x00000000);
+
+		so_method(so, tesla, 0x1224, 1);
+		so_data  (so, 1);
+	}
+
+	if (fb->zsbuf) {
+		if (!gw) {
+			w = fb->zsbuf->width;
+			h = fb->zsbuf->height;
+			gw = 1;
+		} else {
+			assert(w == fb->zsbuf->width);
+			assert(h == fb->zsbuf->height);
+		}
+
+		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5);
+		so_reloc (so, fb->zsbuf->buffer, fb->zsbuf->offset,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (so, fb->zsbuf->buffer, fb->zsbuf->offset,
+			  NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0);
+		switch (fb->zsbuf->format) {
+		case PIPE_FORMAT_Z24S8_UNORM:
+			so_data(so, 0x16);
+			break;
+		case PIPE_FORMAT_Z16_UNORM:
+			so_data(so, 0x15);
+			break;
+		default:
+			NOUVEAU_ERR("AIIII unknown format %s\n",
+				    pf_name(fb->zsbuf->format));
+			so_data(so, 0x16);
+			break;
+		}
+		so_data(so, 0x00000000);
+		so_data(so, 0x00000000);
+
+		so_method(so, tesla, 0x1538, 1);
+		so_data  (so, 1);
+		so_method(so, tesla, 0x1228, 3);
+		so_data  (so, fb->zsbuf->width);
+		so_data  (so, fb->zsbuf->height);
+		so_data  (so, 0x00010001);
+	}
+
+	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, w << 16);
+	so_data  (so, h << 16);
+	so_method(so, tesla, 0x0e04, 2);
+	so_data  (so, w << 16);
+	so_data  (so, h << 16);
+	so_method(so, tesla, 0xdf8, 2);
+	so_data  (so, 0);
+	so_data  (so, h);
+
+	so_ref(so, &nv50->state.fb);
+}
+
+static void
+nv50_state_emit(struct nv50_context *nv50)
+{
+	struct nv50_screen *screen = nv50->screen;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	if (nv50->pctx_id != screen->cur_pctx) {
+		nv50->state.dirty |= 0xffffffff;
+		screen->cur_pctx = nv50->pctx_id;
+	}
+
+	if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER)
+		so_emit(nvws, nv50->state.fb);
+	if (nv50->state.dirty & NV50_NEW_BLEND)
+		so_emit(nvws, nv50->state.blend);
+	if (nv50->state.dirty & NV50_NEW_ZSA)
+		so_emit(nvws, nv50->state.zsa);
+	if (nv50->state.dirty & NV50_NEW_VERTPROG)
+		so_emit(nvws, nv50->state.vertprog);
+	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
+		so_emit(nvws, nv50->state.fragprog);
+	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
+		so_emit(nvws, nv50->state.rast);
+	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
+		so_emit(nvws, nv50->state.blend_colour);
+	if (nv50->state.dirty & NV50_NEW_STIPPLE)
+		so_emit(nvws, nv50->state.stipple);
+	if (nv50->state.dirty & NV50_NEW_SCISSOR)
+		so_emit(nvws, nv50->state.scissor);
+	if (nv50->state.dirty & NV50_NEW_VIEWPORT)
+		so_emit(nvws, nv50->state.viewport);
+	if (nv50->state.dirty & NV50_NEW_SAMPLER)
+		so_emit(nvws, nv50->state.tsc_upload);
+	if (nv50->state.dirty & NV50_NEW_TEXTURE)
+		so_emit(nvws, nv50->state.tic_upload);
+	if (nv50->state.dirty & NV50_NEW_ARRAYS) {
+		so_emit(nvws, nv50->state.vtxfmt);
+		so_emit(nvws, nv50->state.vtxbuf);
+	}
+	nv50->state.dirty = 0;
+
+	so_emit_reloc_markers(nvws, nv50->state.fb);
+	so_emit_reloc_markers(nvws, nv50->state.vertprog);
+	so_emit_reloc_markers(nvws, nv50->state.fragprog);
+	so_emit_reloc_markers(nvws, nv50->state.vtxbuf);
+	so_emit_reloc_markers(nvws, nv50->screen->static_init);
+}
+
+boolean
+nv50_state_validate(struct nv50_context *nv50)
+{
+	const struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so;
+	unsigned i;
+
+	for (i = 0; i < fb->num_cbufs; i++)
+		fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	if (fb->zsbuf)
+		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	if (nv50->dirty & NV50_NEW_FRAMEBUFFER)
+		nv50_state_validate_fb(nv50);
+
+	if (nv50->dirty & NV50_NEW_BLEND)
+		so_ref(nv50->blend->so, &nv50->state.blend);
+
+	if (nv50->dirty & NV50_NEW_ZSA)
+		so_ref(nv50->zsa->so, &nv50->state.zsa);
+
+	if (nv50->dirty & (NV50_NEW_VERTPROG | NV50_NEW_VERTPROG_CB))
+		nv50_vertprog_validate(nv50);
+
+	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
+		nv50_fragprog_validate(nv50);
+
+	if (nv50->dirty & NV50_NEW_RASTERIZER)
+		so_ref(nv50->rasterizer->so, &nv50->state.rast);
+
+	if (nv50->dirty & NV50_NEW_BLEND_COLOUR) {
+		so = so_new(5, 0);
+		so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4);
+		so_data  (so, fui(nv50->blend_colour.color[0]));
+		so_data  (so, fui(nv50->blend_colour.color[1]));
+		so_data  (so, fui(nv50->blend_colour.color[2]));
+		so_data  (so, fui(nv50->blend_colour.color[3]));
+		so_ref(so, &nv50->state.blend_colour);
+	}
+
+	if (nv50->dirty & NV50_NEW_STIPPLE) {
+		so = so_new(33, 0);
+		so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+		for (i = 0; i < 32; i++)
+			so_data(so, nv50->stipple.stipple[i]);
+		so_ref(so, &nv50->state.stipple);
+	}
+
+	if (nv50->dirty & (NV50_NEW_SCISSOR | NV50_NEW_RASTERIZER)) {
+		struct pipe_rasterizer_state *rast = &nv50->rasterizer->pipe;
+		struct pipe_scissor_state *s = &nv50->scissor;
+
+		if (nv50->state.scissor &&
+		    (rast->scissor == 0 && nv50->state.scissor_enabled == 0))
+			goto scissor_uptodate;
+		nv50->state.scissor_enabled = rast->scissor;
+
+		so = so_new(3, 0);
+		so_method(so, tesla, 0x0ff4, 2);
+		if (nv50->state.scissor_enabled) {
+			so_data(so, ((s->maxx - s->minx) << 16) | s->minx);
+			so_data(so, ((s->maxy - s->miny) << 16) | s->miny);
+		} else {
+			so_data(so, (8192 << 16));
+			so_data(so, (8192 << 16));
+		}
+		so_ref(so, &nv50->state.scissor);
+		nv50->state.dirty |= NV50_NEW_SCISSOR;
+	}
+scissor_uptodate:
+
+	if (nv50->dirty & NV50_NEW_VIEWPORT) {
+		unsigned bypass;
+
+		if (!nv50->rasterizer->pipe.bypass_clipping)
+			bypass = 0;
+		else
+			bypass = 1;
+
+		if (nv50->state.viewport &&
+		    (bypass || !(nv50->dirty & NV50_NEW_VIEWPORT)) &&
+		    nv50->state.viewport_bypass == bypass)
+			goto viewport_uptodate;
+		nv50->state.viewport_bypass = bypass;
+
+		so = so_new(12, 0);
+		if (!bypass) {
+			so_method(so, tesla, NV50TCL_VIEWPORT_UNK1(0), 3);
+			so_data  (so, fui(nv50->viewport.translate[0]));
+			so_data  (so, fui(nv50->viewport.translate[1]));
+			so_data  (so, fui(nv50->viewport.translate[2]));
+			so_method(so, tesla, NV50TCL_VIEWPORT_UNK0(0), 3);
+			so_data  (so, fui(nv50->viewport.scale[0]));
+			so_data  (so, fui(-nv50->viewport.scale[1]));
+			so_data  (so, fui(nv50->viewport.scale[2]));
+			so_method(so, tesla, 0x192c, 1);
+			so_data  (so, 1);
+			so_method(so, tesla, 0x0f90, 1);
+			so_data  (so, 0);
+		} else {
+			so_method(so, tesla, 0x192c, 1);
+			so_data  (so, 0);
+			so_method(so, tesla, 0x0f90, 1);
+			so_data  (so, 1);
+		}
+
+		so_ref(so, &nv50->state.viewport);
+	}
+viewport_uptodate:
+
+	if (nv50->dirty & NV50_NEW_SAMPLER) {
+		int i;
+
+		so = so_new(nv50->sampler_nr * 8 + 3, 0);
+		so_method(so, tesla, 0x0f00, 1);
+		so_data  (so, NV50_CB_TSC);
+		so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8);
+		for (i = 0; i < nv50->sampler_nr; i++)
+			so_datap (so, nv50->sampler[i], 8);
+		so_ref(so, &nv50->state.tsc_upload);
+	}
+
+	if (nv50->dirty & NV50_NEW_TEXTURE)
+		nv50_tex_validate(nv50);
+
+	if (nv50->dirty & NV50_NEW_ARRAYS)
+		nv50_vbo_validate(nv50);
+
+	nv50->state.dirty |= nv50->dirty;
+	nv50->dirty = 0;
+	nv50_state_emit(nv50);
+
+	return TRUE;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
new file mode 100644
index 00000000000..5bf97d3a6bf
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_tile.h"
+
+static void
+nv50_surface_copy(struct pipe_context *pipe, boolean flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	struct nouveau_winsys *nvws = nv50->screen->nvws;
+
+	if (flip) {
+		desty += height;
+		while (height--) {
+			nvws->surface_copy(nvws, dest, destx, desty--, src,
+					   srcx, srcy++, width, 1);
+		}
+	} else {
+		nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+				   width, height);
+	}
+}
+
+static void
+nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nv50_context *nv50 = (struct nv50_context *)pipe;
+	struct nouveau_winsys *nvws = nv50->screen->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+static void *
+nv50_surface_map(struct pipe_screen *screen, struct pipe_surface *ps,
+		 unsigned flags )
+{
+	struct nouveau_winsys *nvws = nv50_screen(screen)->nvws;
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv50_surface *s = nv50_surface(ps);
+	struct nv50_surface m = *s;
+	void *map;
+
+	if (!s->untiled) {
+		s->untiled = ws->buffer_create(ws, 0, 0, ps->buffer->size);
+
+		m.base.buffer = s->untiled;
+		nvws->surface_copy(nvws, &m.base, 0, 0, &s->base, 0, 0,
+					 ps->width, ps->height);
+	}
+
+	/* Map original tiled surface to disallow it being validated while
+	 * untiled mirror is mapped.
+	 */
+	ws->buffer_map(ws, ps->buffer, flags);
+
+	map = ws->buffer_map(ws, s->untiled, flags);
+	if (!map)
+		return NULL;
+
+	return map;
+}
+
+static void
+nv50_surface_unmap(struct pipe_screen *pscreen, struct pipe_surface *ps)
+{
+	struct nouveau_winsys *nvws = nv50_screen(pscreen)->nvws;
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv50_surface *s = nv50_surface(ps);
+	struct nv50_surface m = *s;
+
+	ws->buffer_unmap(ws, s->untiled);
+	ws->buffer_unmap(ws, ps->buffer);
+
+	m.base.buffer = s->untiled;
+	nvws->surface_copy(nvws, &s->base, 0, 0, &m.base, 0, 0,
+				 ps->width, ps->height);
+
+	pipe_buffer_reference(pscreen, &s->untiled, NULL);
+}
+
+void
+nv50_init_surface_functions(struct nv50_context *nv50)
+{
+	nv50->pipe.surface_copy = nv50_surface_copy;
+	nv50->pipe.surface_fill = nv50_surface_fill;
+}
+
+void
+nv50_surface_init_screen_functions(struct pipe_screen *pscreen)
+{
+	pscreen->surface_map = nv50_surface_map;
+	pscreen->surface_unmap = nv50_surface_unmap;
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
new file mode 100644
index 00000000000..fde3c97c059
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nv50_context.h"
+#include "nv50_texture.h"
+
+#include "nouveau/nouveau_stateobj.h"
+
+static int
+nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
+{
+	switch (mt->base.format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_8_8_8_8);
+		break;
+	case PIPE_FORMAT_A1R5G5B5_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_1_5_5_5);
+		break;
+	case PIPE_FORMAT_A4R4G4B4_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_4_4_4_4);
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_5_6_5);
+		break;
+	case PIPE_FORMAT_L8_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_8);
+		break;
+	case PIPE_FORMAT_A8_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_8);
+		break;
+	case PIPE_FORMAT_I8_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_8);
+		break;
+	case PIPE_FORMAT_A8L8_UNORM:
+		so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_8_8);
+		break;
+	default:
+		return 1;
+	}
+
+	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0);
+	so_data (so, 0xd0005000);
+	so_data (so, 0x00300000);
+	so_data (so, mt->base.width[0]);
+	so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]);
+	so_data (so, 0x03000000);
+	so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0);
+
+	return 0;
+}
+
+void
+nv50_tex_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so;
+	int i;
+
+	so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2);
+	so_method(so, tesla, 0x0f00, 1);
+	so_data  (so, NV50_CB_TIC);
+	so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8);
+	for (i = 0; i < nv50->miptree_nr; i++) {
+		if (nv50_tex_construct(so, nv50->miptree[i])) {
+			NOUVEAU_ERR("failed tex validate\n");
+			so_ref(NULL, &so);
+			return;
+		}
+	}
+
+	so_ref(so, &nv50->state.tic_upload);
+}
+
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
new file mode 100644
index 00000000000..6861d67b4d5
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -0,0 +1,126 @@
+#ifndef __NV50_TEXTURE_H__
+#define __NV50_TEXTURE_H__
+
+/* It'd be really nice to have these in nouveau_class.h generated by
+ * renouveau like the rest of the object header - but not sure it can
+ * handle non-object stuff nicely - need to look into it.
+ */
+
+/* Texture image control block */
+#define NV50TIC_0_0_MAPA_MASK                                     0x38000000
+#define NV50TIC_0_0_MAPA_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPA_C0                                       0x10000000
+#define NV50TIC_0_0_MAPA_C1                                       0x18000000
+#define NV50TIC_0_0_MAPA_C2                                       0x20000000
+#define NV50TIC_0_0_MAPA_C3                                       0x28000000
+#define NV50TIC_0_0_MAPA_ONE                                      0x38000000
+#define NV50TIC_0_0_MAPR_MASK                                     0x07000000
+#define NV50TIC_0_0_MAPR_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPR_C0                                       0x02000000
+#define NV50TIC_0_0_MAPR_C1                                       0x03000000
+#define NV50TIC_0_0_MAPR_C2                                       0x04000000
+#define NV50TIC_0_0_MAPR_C3                                       0x05000000
+#define NV50TIC_0_0_MAPR_ONE                                      0x07000000
+#define NV50TIC_0_0_MAPG_MASK                                     0x00e00000
+#define NV50TIC_0_0_MAPG_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPG_C0                                       0x00400000
+#define NV50TIC_0_0_MAPG_C1                                       0x00600000
+#define NV50TIC_0_0_MAPG_C2                                       0x00800000
+#define NV50TIC_0_0_MAPG_C3                                       0x00a00000
+#define NV50TIC_0_0_MAPG_ONE                                      0x00e00000
+#define NV50TIC_0_0_MAPB_MASK                                     0x001c0000
+#define NV50TIC_0_0_MAPB_ZERO                                     0x00000000
+#define NV50TIC_0_0_MAPB_C0                                       0x00080000
+#define NV50TIC_0_0_MAPB_C1                                       0x000c0000
+#define NV50TIC_0_0_MAPB_C2                                       0x00100000
+#define NV50TIC_0_0_MAPB_C3                                       0x00140000
+#define NV50TIC_0_0_MAPB_ONE                                      0x001c0000
+#define NV50TIC_0_0_TYPEA_MASK                                    0x00038000
+#define NV50TIC_0_0_TYPEA_UNORM                                   0x00010000
+#define NV50TIC_0_0_TYPER_MASK                                    0x00007000
+#define NV50TIC_0_0_TYPER_UNORM                                   0x00002000
+#define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00
+#define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400
+#define NV50TIC_0_0_TYPEB_MASK                                    0x000001c0
+#define NV50TIC_0_0_TYPEB_UNORM                                   0x00000080
+#define NV50TIC_0_0_FMT_MASK                                      0x0000003c
+#define NV50TIC_0_0_FMT_8_8_8_8                                   0x00000008
+#define NV50TIC_0_0_FMT_4_4_4_4                                   0x00000012
+#define NV50TIC_0_0_FMT_1_5_5_5                                   0x00000013
+#define NV50TIC_0_0_FMT_5_6_5                                     0x00000015
+#define NV50TIC_0_0_FMT_8_8                                       0x00000018
+#define NV50TIC_0_0_FMT_8                                         0x0000001d
+
+#define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
+#define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
+
+#define NV50TIC_0_2_UNKNOWN_MASK                                  0xffffffff
+
+#define NV50TIC_0_3_UNKNOWN_MASK                                  0xffffffff
+
+#define NV50TIC_0_4_WIDTH_MASK                                    0x0000ffff
+#define NV50TIC_0_4_WIDTH_SHIFT                                            0
+
+#define NV50TIC_0_5_DEPTH_MASK                                    0xffff0000
+#define NV50TIC_0_5_DEPTH_SHIFT                                           16
+#define NV50TIC_0_5_HEIGHT_MASK                                   0x0000ffff
+#define NV50TIC_0_5_HEIGHT_SHIFT                                           0
+
+#define NV50TIC_0_6_UNKNOWN_MASK                                  0xffffffff
+
+#define NV50TIC_0_7_OFFSET_HIGH_MASK                              0xffffffff
+#define NV50TIC_0_7_OFFSET_HIGH_SHIFT                                      0
+
+/* Texture sampler control block */
+#define NV50TSC_1_0_WRAPS_MASK                                   0x00000007
+#define NV50TSC_1_0_WRAPS_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPS_MIRROR_REPEAT                          0x00000001
+#define NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE                          0x00000002
+#define NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER                        0x00000003
+#define NV50TSC_1_0_WRAPS_CLAMP                                  0x00000004
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE                   0x00000005
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER                 0x00000006
+#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP                           0x00000007
+#define NV50TSC_1_0_WRAPT_MASK                                   0x00000038
+#define NV50TSC_1_0_WRAPT_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPT_MIRROR_REPEAT                          0x00000008
+#define NV50TSC_1_0_WRAPT_CLAMP_TO_EDGE                          0x00000010
+#define NV50TSC_1_0_WRAPT_CLAMP_TO_BORDER                        0x00000018
+#define NV50TSC_1_0_WRAPT_CLAMP                                  0x00000020
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_EDGE                   0x00000028
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_BORDER                 0x00000030
+#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP                           0x00000038
+#define NV50TSC_1_0_WRAPR_MASK                                   0x000001c0
+#define NV50TSC_1_0_WRAPR_REPEAT                                 0x00000000
+#define NV50TSC_1_0_WRAPR_MIRROR_REPEAT                          0x00000040
+#define NV50TSC_1_0_WRAPR_CLAMP_TO_EDGE                          0x00000080
+#define NV50TSC_1_0_WRAPR_CLAMP_TO_BORDER                        0x000000c0
+#define NV50TSC_1_0_WRAPR_CLAMP                                  0x00000100
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE                   0x00000140
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER                 0x00000180
+#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP                           0x000001c0
+
+#define NV50TSC_1_1_MAGF_MASK                                    0x00000003
+#define NV50TSC_1_1_MAGF_NEAREST                                 0x00000001
+#define NV50TSC_1_1_MAGF_LINEAR                                  0x00000002
+#define NV50TSC_1_1_MINF_MASK                                    0x00000030
+#define NV50TSC_1_1_MINF_NEAREST                                 0x00000010
+#define NV50TSC_1_1_MINF_LINEAR                                  0x00000020
+#define NV50TSC_1_1_MIPF_MASK                                    0x000000c0
+#define NV50TSC_1_1_MIPF_NONE                                    0x00000040
+#define NV50TSC_1_1_MIPF_NEAREST                                 0x00000080
+#define NV50TSC_1_1_MIPF_LINEAR                                  0x000000c0
+
+#define NV50TSC_1_2_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_3_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_4_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_5_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_6_UNKNOWN_MASK                                 0xffffffff
+
+#define NV50TSC_1_7_UNKNOWN_MASK                                 0xffffffff
+
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
new file mode 100644
index 00000000000..584336682e4
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv50_context.h"
+
+static INLINE unsigned
+nv50_prim(unsigned mode)
+{
+	switch (mode) {
+	case PIPE_PRIM_POINTS: return NV50TCL_VERTEX_BEGIN_POINTS;
+	case PIPE_PRIM_LINES: return NV50TCL_VERTEX_BEGIN_LINES;
+	case PIPE_PRIM_LINE_LOOP: return NV50TCL_VERTEX_BEGIN_LINE_LOOP;
+	case PIPE_PRIM_LINE_STRIP: return NV50TCL_VERTEX_BEGIN_LINE_STRIP;
+	case PIPE_PRIM_TRIANGLES: return NV50TCL_VERTEX_BEGIN_TRIANGLES;
+	case PIPE_PRIM_TRIANGLE_STRIP:
+		return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP;
+	case PIPE_PRIM_TRIANGLE_FAN: return NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN;
+	case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS;
+	case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP;
+	case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON;
+	default:
+		break;
+	}
+
+	NOUVEAU_ERR("invalid primitive type %d\n", mode);
+	return NV50TCL_VERTEX_BEGIN_POINTS;
+}
+
+boolean
+nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
+		 unsigned count)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+
+	nv50_state_validate(nv50);
+
+	BEGIN_RING(tesla, 0x142c, 1);
+	OUT_RING  (0);
+	BEGIN_RING(tesla, 0x142c, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(tesla, NV50TCL_VERTEX_BEGIN, 1);
+	OUT_RING  (nv50_prim(mode));
+	BEGIN_RING(tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+	OUT_RING  (start);
+	OUT_RING  (count);
+	BEGIN_RING(tesla, NV50TCL_VERTEX_END, 1);
+	OUT_RING  (0);
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+static INLINE void
+nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
+			      unsigned start, unsigned count)
+{
+	map += start;
+
+	if (count & 1) {
+		BEGIN_RING(tesla, 0x15e8, 1);
+		OUT_RING  (map[0]);
+		map++;
+		count--;
+	}
+
+	while (count) {
+		unsigned nr = count > 2046 ? 2046 : count;
+		int i;
+
+		BEGIN_RING(tesla, 0x400015f0, nr >> 1);
+		for (i = 0; i < nr; i += 2)
+			OUT_RING  ((map[1] << 16) | map[0]);
+
+		count -= nr;
+		map += nr;
+	}
+}
+
+static INLINE void
+nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
+			      unsigned start, unsigned count)
+{
+	map += start;
+
+	if (count & 1) {
+		BEGIN_RING(tesla, 0x15e8, 1);
+		OUT_RING  (map[0]);
+		map++;
+		count--;
+	}
+
+	while (count) {
+		unsigned nr = count > 2046 ? 2046 : count;
+		int i;
+
+		BEGIN_RING(tesla, 0x400015f0, nr >> 1);
+		for (i = 0; i < nr; i += 2)
+			OUT_RING  ((map[1] << 16) | map[0]);
+
+		count -= nr;
+		map += nr;
+	}
+}
+
+static INLINE void
+nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map,
+			      unsigned start, unsigned count)
+{
+	map += start;
+
+	while (count) {
+		unsigned nr = count > 2047 ? 2047 : count;
+
+		BEGIN_RING(tesla, 0x400015e8, nr);
+		OUT_RINGp (map, nr);
+
+		count -= nr;
+		map += nr;
+	}
+}
+
+boolean
+nv50_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+
+	nv50_state_validate(nv50);
+
+	BEGIN_RING(tesla, 0x142c, 1);
+	OUT_RING  (0);
+	BEGIN_RING(tesla, 0x142c, 1);
+	OUT_RING  (0);
+
+	BEGIN_RING(tesla, NV50TCL_VERTEX_BEGIN, 1);
+	OUT_RING  (nv50_prim(mode));
+	switch (indexSize) {
+	case 1:
+		nv50_draw_elements_inline_u08(nv50, map, start, count);
+		break;
+	case 2:
+		nv50_draw_elements_inline_u16(nv50, map, start, count);
+		break;
+	case 4:
+		nv50_draw_elements_inline_u32(nv50, map, start, count);
+		break;
+	default:
+		assert(0);
+	}
+	BEGIN_RING(tesla, NV50TCL_VERTEX_END, 1);
+	OUT_RING  (0);
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+void
+nv50_vbo_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *vtxbuf, *vtxfmt;
+	int i, vpi = 0;
+
+	vtxbuf = so_new(nv50->vtxelt_nr * 4, nv50->vtxelt_nr * 2);
+	vtxfmt = so_new(nv50->vtxelt_nr + 1, 0);
+	so_method(vtxfmt, tesla, 0x1ac0, nv50->vtxelt_nr);
+
+	for (i = 0; i < nv50->vtxelt_nr; i++) {
+		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
+		struct pipe_vertex_buffer *vb =
+			&nv50->vtxbuf[ve->vertex_buffer_index];
+
+		switch (ve->src_format) {
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+			so_data(vtxfmt, 0x7e080000 | i);
+			break;
+		case PIPE_FORMAT_R32G32B32_FLOAT:
+			so_data(vtxfmt, 0x7e100000 | i);
+			break;
+		case PIPE_FORMAT_R32G32_FLOAT:
+			so_data(vtxfmt, 0x7e200000 | i);
+			break;
+		case PIPE_FORMAT_R32_FLOAT:
+			so_data(vtxfmt, 0x7e900000 | i);
+			break;
+		case PIPE_FORMAT_R8G8B8A8_UNORM:
+			so_data(vtxfmt, 0x24500000 | i);
+			break;
+		default:
+		{
+			NOUVEAU_ERR("invalid vbo format %s\n",
+				    pf_name(ve->src_format));
+			assert(0);
+			return;
+		}
+		}
+
+		so_method(vtxbuf, tesla, 0x900 + (i * 16), 3);
+		so_data  (vtxbuf, 0x20000000 | vb->pitch);
+		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+		so_reloc (vtxbuf, vb->buffer, vb->buffer_offset +
+			  ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	so_ref (vtxfmt, &nv50->state.vtxfmt);
+	so_ref (vtxbuf, &nv50->state.vtxbuf);
+}
+
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 701ee4c72f2..453b0373f0f 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -39,11 +39,19 @@
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_parse.h"
 
-struct sp_exec_fragment_shader {
+struct sp_exec_fragment_shader
+{
    struct sp_fragment_shader base;
 };
 
 
+/** cast wrapper */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(const struct sp_fragment_shader *base)
+{
+   return (struct sp_exec_fragment_shader *) base;
+}
+
 
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
@@ -84,12 +92,18 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
 static void
 exec_prepare( const struct sp_fragment_shader *base,
 	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler *samplers )
+	      struct tgsi_sampler **samplers )
 {
-   tgsi_exec_machine_bind_shader( machine,
-				  base->shader.tokens,
-				  PIPE_MAX_SAMPLERS,
-				  samplers );
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (machine->Tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+   }
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 50eb2c07bcb..9a273c87643 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
 #include "tgsi/tgsi_sse2.h"
 
 
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86)
 
 #include "rtasm/rtasm_x86sse.h"
 
@@ -69,7 +69,7 @@ struct sp_sse_fragment_shader {
 static void
 fs_sse_prepare( const struct sp_fragment_shader *base,
 		struct tgsi_exec_machine *machine,
-		struct tgsi_sampler *samplers )
+		struct tgsi_sampler **samplers )
 {
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 425e13cd283..9cd5784e5bc 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -26,10 +26,10 @@
  **************************************************************************/
 
 /**
- * Post-transform vertex buffering.  This is an optional part of the
- * softpipe rendering pipeline.
- * Probably not desired in general, but useful for testing/debuggin.
- * Enabled/Disabled with SP_VBUF env var.
+ * Interface between 'draw' module's output and the softpipe rasterizer/setup
+ * code.  When the 'draw' module has finished filling a vertex buffer, the
+ * draw_arrays() functions below will be called.  Loop over the vertices and
+ * call the point/line/tri setup functions.
  *
  * Authors
  *  Brian Paul
@@ -131,21 +131,23 @@ static INLINE cptrf4 get_vert( const void *vertex_buffer,
 }
 
 
+/**
+ * draw elements / indexed primitives
+ */
 static void
 sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
-   unsigned i;
+   const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   unsigned i;
 
    /* XXX: break this dependency - make setup_context live under
     * softpipe, rename the old "setup" draw stage to something else.
     */
    struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(softpipe->setup);
-
+   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
 
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
@@ -258,13 +260,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
-   struct draw_stage *setup = softpipe->setup;
-   const void *vertex_buffer = NULL;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
+   const void *vertex_buffer =
+      (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
 
-   vertex_buffer = (void *)get_vert(cvbr->vertex_buffer, start, stride);
+   /* XXX: break this dependency - make setup_context live under
+    * softpipe, rename the old "setup" draw stage to something else.
+    */
+   struct draw_stage *setup = softpipe->setup;
+   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
 
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 1f0cb3e0355..730fa0cf49f 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -50,8 +50,9 @@
 
 struct quad_shade_stage
 {
-   struct quad_stage stage;
-   struct tgsi_sampler samplers[PIPE_MAX_SAMPLERS];
+   struct quad_stage stage;  /**< base class */
+   struct sp_shader_sampler samplers[PIPE_MAX_SAMPLERS];
+   struct sp_shader_sampler *samplers_list[PIPE_MAX_SAMPLERS];
    struct tgsi_exec_machine machine;
    struct tgsi_exec_vector *inputs, *outputs;
 };
@@ -147,18 +148,10 @@ static void shade_begin(struct quad_stage *qs)
 {
    struct quad_shade_stage *qss = quad_shade_stage(qs);
    struct softpipe_context *softpipe = qs->softpipe;
-   unsigned i;
-   unsigned num = MAX2(softpipe->num_textures, softpipe->num_samplers);
-
-   /* set TGSI sampler state that varies */
-   for (i = 0; i < num; i++) {
-      qss->samplers[i].state = softpipe->sampler[i];
-      qss->samplers[i].texture = softpipe->texture[i];
-   }
 
    softpipe->fs->prepare( softpipe->fs, 
 			  &qss->machine,
-			  qss->samplers );
+			  qss->samplers_list );
 
    qs->next->begin(qs->next);
 }
@@ -191,12 +184,14 @@ struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe )
    qss->stage.run = shade_quad;
    qss->stage.destroy = shade_destroy;
 
-   /* set TGSI sampler state that's constant */
+   /* setup TGSI sampler state */
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       assert(softpipe->tex_cache[i]);
-      qss->samplers[i].get_samples = sp_get_samples;
-      qss->samplers[i].pipe = &softpipe->pipe;
+      qss->samplers[i].base.get_samples = sp_get_samples;
+      qss->samplers[i].unit = i;
+      qss->samplers[i].sp = softpipe;
       qss->samplers[i].cache = softpipe->tex_cache[i];
+      qss->samplers_list[i] = &qss->samplers[i];
    }
 
    tgsi_exec_machine_init( &qss->machine );
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 49250ec084c..b66caf95078 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -35,6 +35,7 @@
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
+#include "sp_texture.h"
 #include "sp_tex_sample.h"
 #include "sp_tile_cache.h"
 #include "pipe/p_context.h"
@@ -463,7 +464,8 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
  * This is only done for fragment shaders, not vertex shaders.
  */
 static float
-compute_lambda(struct tgsi_sampler *sampler,
+compute_lambda(const struct pipe_texture *tex,
+               const struct pipe_sampler_state *sampler,
                const float s[QUAD_SIZE],
                const float t[QUAD_SIZE],
                const float p[QUAD_SIZE],
@@ -471,7 +473,7 @@ compute_lambda(struct tgsi_sampler *sampler,
 {
    float rho, lambda;
 
-   assert(sampler->state->normalized_coords);
+   assert(sampler->normalized_coords);
 
    assert(s);
    {
@@ -479,7 +481,7 @@ compute_lambda(struct tgsi_sampler *sampler,
       float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
       dsdx = fabsf(dsdx);
       dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * sampler->texture->width[0];
+      rho = MAX2(dsdx, dsdy) * tex->width[0];
    }
    if (t) {
       float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
@@ -487,7 +489,7 @@ compute_lambda(struct tgsi_sampler *sampler,
       float max;
       dtdx = fabsf(dtdx);
       dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * sampler->texture->height[0];
+      max = MAX2(dtdx, dtdy) * tex->height[0];
       rho = MAX2(rho, max);
    }
    if (p) {
@@ -496,13 +498,13 @@ compute_lambda(struct tgsi_sampler *sampler,
       float max;
       dpdx = fabsf(dpdx);
       dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * sampler->texture->depth[0];
+      max = MAX2(dpdx, dpdy) * tex->depth[0];
       rho = MAX2(rho, max);
    }
 
    lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->state->lod_bias;
-   lambda = CLAMP(lambda, sampler->state->min_lod, sampler->state->max_lod);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
    return lambda;
 }
@@ -516,7 +518,8 @@ compute_lambda(struct tgsi_sampler *sampler,
  * 4. Return image filter to use within mipmap images
  */
 static void
-choose_mipmap_levels(struct tgsi_sampler *sampler,
+choose_mipmap_levels(const struct pipe_texture *texture,
+                     const struct pipe_sampler_state *sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
@@ -524,25 +527,26 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
                      unsigned *level0, unsigned *level1, float *levelBlend,
                      unsigned *imgFilter)
 {
-   if (sampler->state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+
+   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->state->min_lod,
-                                0, (int) sampler->texture->last_level);
+      *level0 = *level1 = CLAMP((int) sampler->min_lod,
+                                0, (int) texture->last_level);
 
-      if (sampler->state->min_img_filter != sampler->state->mag_img_filter) {
+      if (sampler->min_img_filter != sampler->mag_img_filter) {
          /* non-mipmapped texture, but still need to determine if doing
           * minification or magnification.
           */
-         float lambda = compute_lambda(sampler, s, t, p, lodbias);
+         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
          if (lambda <= 0.0) {
-            *imgFilter = sampler->state->mag_img_filter;
+            *imgFilter = sampler->mag_img_filter;
          }
          else {
-            *imgFilter = sampler->state->min_img_filter;
+            *imgFilter = sampler->min_img_filter;
          }
       }
       else {
-         *imgFilter = sampler->state->mag_img_filter;
+         *imgFilter = sampler->mag_img_filter;
       }
    }
    else {
@@ -550,32 +554,32 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
 
       if (1)
          /* fragment shader */
-         lambda = compute_lambda(sampler, s, t, p, lodbias);
+         lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
       else
          /* vertex shader */
          lambda = lodbias; /* not really a bias, but absolute LOD */
 
       if (lambda <= 0.0) { /* XXX threshold depends on the filter */
          /* magnifying */
-         *imgFilter = sampler->state->mag_img_filter;
+         *imgFilter = sampler->mag_img_filter;
          *level0 = *level1 = 0;
       }
       else {
          /* minifying */
-         *imgFilter = sampler->state->min_img_filter;
+         *imgFilter = sampler->min_img_filter;
 
          /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->state->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
             /* Nearest mipmap level */
             const int lvl = (int) (lambda + 0.5);
             *level0 =
-            *level1 = CLAMP(lvl, 0, (int) sampler->texture->last_level);
+            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
          }
          else {
             /* Linear interpolation between mipmap levels */
             const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) sampler->texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) sampler->texture->last_level);
+            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
+            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
             *levelBlend = FRAC(lambda);  /* blending weight between levels */
          }
       }
@@ -585,6 +589,7 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
 
 /**
  * Get a texel from a texture, using the texture tile cache.
+ * Called by the TGSI interpreter.
  *
  * \param face  the cube face in 0..5
  * \param level  the mipmap level
@@ -598,23 +603,29 @@ choose_mipmap_levels(struct tgsi_sampler *sampler,
  * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
  */
 static void
-get_texel(struct tgsi_sampler *sampler,
+get_texel(struct tgsi_sampler *tgsi_sampler,
           unsigned face, unsigned level, int x, int y, int z,
           float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
 {
-   if (x < 0 || x >= (int) sampler->texture->width[level] ||
-       y < 0 || y >= (int) sampler->texture->height[level] ||
-       z < 0 || z >= (int) sampler->texture->depth[level]) {
-      rgba[0][j] = sampler->state->border_color[0];
-      rgba[1][j] = sampler->state->border_color[1];
-      rgba[2][j] = sampler->state->border_color[2];
-      rgba[3][j] = sampler->state->border_color[3];
+   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
+   const struct softpipe_context *sp = samp->sp;
+   const uint unit = samp->unit;
+   const struct pipe_texture *texture = sp->texture[unit];
+   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      rgba[0][j] = sampler->border_color[0];
+      rgba[1][j] = sampler->border_color[1];
+      rgba[2][j] = sampler->border_color[2];
+      rgba[3][j] = sampler->border_color[3];
    }
    else {
       const int tx = x % TILE_SIZE;
       const int ty = y % TILE_SIZE;
       const struct softpipe_cached_tile *tile
-         = sp_get_cached_tile_tex(sampler->pipe, sampler->cache,
+         = sp_get_cached_tile_tex(samp->sp, samp->cache,
                                   x, y, z, face, level);
       rgba[0][j] = tile->data.color[ty][tx][0];
       rgba[1][j] = tile->data.color[ty][tx][1];
@@ -624,7 +635,7 @@ get_texel(struct tgsi_sampler *sampler,
       {
          debug_printf("Get texel %f %f %f %f from %s\n",
                       rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(sampler->texture->format));
+                      pf_name(texture->format));
       }
    }
 }
@@ -682,7 +693,7 @@ shadow_compare(uint compare_func,
  * Could probably extend for 3D...
  */
 static void
-sp_get_samples_2d_common(struct tgsi_sampler *sampler,
+sp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
                          const float s[QUAD_SIZE],
                          const float t[QUAD_SIZE],
                          const float p[QUAD_SIZE],
@@ -690,28 +701,33 @@ sp_get_samples_2d_common(struct tgsi_sampler *sampler,
                          float rgba[NUM_CHANNELS][QUAD_SIZE],
                          const unsigned faces[4])
 {
-   const uint compare_func = sampler->state->compare_func;
+   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
+   const struct softpipe_context *sp = samp->sp;
+   const uint unit = samp->unit;
+   const struct pipe_texture *texture = sp->texture[unit];
+   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+   const uint compare_func = sampler->compare_func;
    unsigned level0, level1, j, imgFilter;
    int width, height;
    float levelBlend;
 
-   choose_mipmap_levels(sampler, s, t, p, lodbias,
+   choose_mipmap_levels(texture, sampler, s, t, p, lodbias,
                         &level0, &level1, &levelBlend, &imgFilter);
 
-   assert(sampler->state->normalized_coords);
+   assert(sampler->normalized_coords);
 
-   width = sampler->texture->width[level0];
-   height = sampler->texture->height[level0];
+   width = texture->width[level0];
+   height = texture->height[level0];
 
    assert(width > 0);
 
    switch (imgFilter) {
    case PIPE_TEX_FILTER_NEAREST:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = nearest_texcoord(sampler->state->wrap_s, s[j], width);
-         int y = nearest_texcoord(sampler->state->wrap_t, t[j], height);
-         get_texel(sampler, faces[j], level0, x, y, 0, rgba, j);
-         if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+         int x = nearest_texcoord(sampler->wrap_s, s[j], width);
+         int y = nearest_texcoord(sampler->wrap_t, t[j], height);
+         get_texel(tgsi_sampler, faces[j], level0, x, y, 0, rgba, j);
+         if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
             shadow_compare(compare_func, rgba, p, j);
          }
 
@@ -721,8 +737,8 @@ sp_get_samples_2d_common(struct tgsi_sampler *sampler,
             unsigned c;
             x = x / 2;
             y = y / 2;
-            get_texel(sampler, faces[j], level1, x, y, 0, rgba2, j);
-            if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
+            get_texel(tgsi_sampler, faces[j], level1, x, y, 0, rgba2, j);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
                shadow_compare(compare_func, rgba2, p, j);
             }
 
@@ -737,13 +753,13 @@ sp_get_samples_2d_common(struct tgsi_sampler *sampler,
       for (j = 0; j < QUAD_SIZE; j++) {
          float tx[4][4], a, b;
          int x0, y0, x1, y1, c;
-         linear_texcoord(sampler->state->wrap_s, s[j], width,  &x0, &x1, &a);
-         linear_texcoord(sampler->state->wrap_t, t[j], height, &y0, &y1, &b);
-         get_texel(sampler, faces[j], level0, x0, y0, 0, tx, 0);
-         get_texel(sampler, faces[j], level0, x1, y0, 0, tx, 1);
-         get_texel(sampler, faces[j], level0, x0, y1, 0, tx, 2);
-         get_texel(sampler, faces[j], level0, x1, y1, 0, tx, 3);
-         if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+         linear_texcoord(sampler->wrap_s, s[j], width,  &x0, &x1, &a);
+         linear_texcoord(sampler->wrap_t, t[j], height, &y0, &y1, &b);
+         get_texel(tgsi_sampler, faces[j], level0, x0, y0, 0, tx, 0);
+         get_texel(tgsi_sampler, faces[j], level0, x1, y0, 0, tx, 1);
+         get_texel(tgsi_sampler, faces[j], level0, x0, y1, 0, tx, 2);
+         get_texel(tgsi_sampler, faces[j], level0, x1, y1, 0, tx, 3);
+         if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
             shadow_compare(compare_func, tx, p, 0);
             shadow_compare(compare_func, tx, p, 1);
             shadow_compare(compare_func, tx, p, 2);
@@ -761,11 +777,11 @@ sp_get_samples_2d_common(struct tgsi_sampler *sampler,
             y0 = y0 / 2;
             x1 = x1 / 2;
             y1 = y1 / 2;
-            get_texel(sampler, faces[j], level1, x0, y0, 0, tx, 0);
-            get_texel(sampler, faces[j], level1, x1, y0, 0, tx, 1);
-            get_texel(sampler, faces[j], level1, x0, y1, 0, tx, 2);
-            get_texel(sampler, faces[j], level1, x1, y1, 0, tx, 3);
-            if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
+            get_texel(tgsi_sampler, faces[j], level1, x0, y0, 0, tx, 0);
+            get_texel(tgsi_sampler, faces[j], level1, x1, y0, 0, tx, 1);
+            get_texel(tgsi_sampler, faces[j], level1, x0, y1, 0, tx, 2);
+            get_texel(tgsi_sampler, faces[j], level1, x1, y1, 0, tx, 3);
+            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
                shadow_compare(compare_func, tx, p, 0);
                shadow_compare(compare_func, tx, p, 1);
                shadow_compare(compare_func, tx, p, 2);
@@ -817,27 +833,32 @@ sp_get_samples_2d(struct tgsi_sampler *sampler,
 
 
 static void
-sp_get_samples_3d(struct tgsi_sampler *sampler,
+sp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
                   const float s[QUAD_SIZE],
                   const float t[QUAD_SIZE],
                   const float p[QUAD_SIZE],
                   float lodbias,
                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
+   const struct softpipe_context *sp = samp->sp;
+   const uint unit = samp->unit;
+   const struct pipe_texture *texture = sp->texture[unit];
+   const struct pipe_sampler_state *sampler = sp->sampler[unit];
    /* get/map pipe_surfaces corresponding to 3D tex slices */
    unsigned level0, level1, j, imgFilter;
    int width, height, depth;
    float levelBlend;
    const uint face = 0;
 
-   choose_mipmap_levels(sampler, s, t, p, lodbias,
+   choose_mipmap_levels(texture, sampler, s, t, p, lodbias,
                         &level0, &level1, &levelBlend, &imgFilter);
 
-   assert(sampler->state->normalized_coords);
+   assert(sampler->normalized_coords);
 
-   width = sampler->texture->width[level0];
-   height = sampler->texture->height[level0];
-   depth = sampler->texture->depth[level0];
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
 
    assert(width > 0);
    assert(height > 0);
@@ -846,10 +867,10 @@ sp_get_samples_3d(struct tgsi_sampler *sampler,
    switch (imgFilter) {
    case PIPE_TEX_FILTER_NEAREST:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = nearest_texcoord(sampler->state->wrap_s, s[j], width);
-         int y = nearest_texcoord(sampler->state->wrap_t, t[j], height);
-         int z = nearest_texcoord(sampler->state->wrap_r, p[j], depth);
-         get_texel(sampler, face, level0, x, y, z, rgba, j);
+         int x = nearest_texcoord(sampler->wrap_s, s[j], width);
+         int y = nearest_texcoord(sampler->wrap_t, t[j], height);
+         int z = nearest_texcoord(sampler->wrap_r, p[j], depth);
+         get_texel(tgsi_sampler, face, level0, x, y, z, rgba, j);
 
          if (level0 != level1) {
             /* get texels from second mipmap level and blend */
@@ -858,7 +879,7 @@ sp_get_samples_3d(struct tgsi_sampler *sampler,
             x /= 2;
             y /= 2;
             z /= 2;
-            get_texel(sampler, face, level1, x, y, z, rgba2, j);
+            get_texel(tgsi_sampler, face, level1, x, y, z, rgba2, j);
             for (c = 0; c < NUM_CHANNELS; c++) {
                rgba[c][j] = LERP(levelBlend, rgba2[c][j], rgba[c][j]);
             }
@@ -871,17 +892,17 @@ sp_get_samples_3d(struct tgsi_sampler *sampler,
          float texel0[4][4], texel1[4][4];
          float xw, yw, zw; /* interpolation weights */
          int x0, x1, y0, y1, z0, z1, c;
-         linear_texcoord(sampler->state->wrap_s, s[j], width,  &x0, &x1, &xw);
-         linear_texcoord(sampler->state->wrap_t, t[j], height, &y0, &y1, &yw);
-         linear_texcoord(sampler->state->wrap_r, p[j], depth,  &z0, &z1, &zw);
-         get_texel(sampler, face, level0, x0, y0, z0, texel0, 0);
-         get_texel(sampler, face, level0, x1, y0, z0, texel0, 1);
-         get_texel(sampler, face, level0, x0, y1, z0, texel0, 2);
-         get_texel(sampler, face, level0, x1, y1, z0, texel0, 3);
-         get_texel(sampler, face, level0, x0, y0, z1, texel1, 0);
-         get_texel(sampler, face, level0, x1, y0, z1, texel1, 1);
-         get_texel(sampler, face, level0, x0, y1, z1, texel1, 2);
-         get_texel(sampler, face, level0, x1, y1, z1, texel1, 3);
+         linear_texcoord(sampler->wrap_s, s[j], width,  &x0, &x1, &xw);
+         linear_texcoord(sampler->wrap_t, t[j], height, &y0, &y1, &yw);
+         linear_texcoord(sampler->wrap_r, p[j], depth,  &z0, &z1, &zw);
+         get_texel(tgsi_sampler, face, level0, x0, y0, z0, texel0, 0);
+         get_texel(tgsi_sampler, face, level0, x1, y0, z0, texel0, 1);
+         get_texel(tgsi_sampler, face, level0, x0, y1, z0, texel0, 2);
+         get_texel(tgsi_sampler, face, level0, x1, y1, z0, texel0, 3);
+         get_texel(tgsi_sampler, face, level0, x0, y0, z1, texel1, 0);
+         get_texel(tgsi_sampler, face, level0, x1, y0, z1, texel1, 1);
+         get_texel(tgsi_sampler, face, level0, x0, y1, z1, texel1, 2);
+         get_texel(tgsi_sampler, face, level0, x1, y1, z1, texel1, 3);
 
          /* 3D lerp */
          for (c = 0; c < 4; c++) {
@@ -904,14 +925,14 @@ sp_get_samples_3d(struct tgsi_sampler *sampler,
             x1 /= 2;
             y1 /= 2;
             z1 /= 2;
-            get_texel(sampler, face, level1, x0, y0, z0, texel0, 0);
-            get_texel(sampler, face, level1, x1, y0, z0, texel0, 1);
-            get_texel(sampler, face, level1, x0, y1, z0, texel0, 2);
-            get_texel(sampler, face, level1, x1, y1, z0, texel0, 3);
-            get_texel(sampler, face, level1, x0, y0, z1, texel1, 0);
-            get_texel(sampler, face, level1, x1, y0, z1, texel1, 1);
-            get_texel(sampler, face, level1, x0, y1, z1, texel1, 2);
-            get_texel(sampler, face, level1, x1, y1, z1, texel1, 3);
+            get_texel(tgsi_sampler, face, level1, x0, y0, z0, texel0, 0);
+            get_texel(tgsi_sampler, face, level1, x1, y0, z0, texel0, 1);
+            get_texel(tgsi_sampler, face, level1, x0, y1, z0, texel0, 2);
+            get_texel(tgsi_sampler, face, level1, x1, y1, z0, texel0, 3);
+            get_texel(tgsi_sampler, face, level1, x0, y0, z1, texel1, 0);
+            get_texel(tgsi_sampler, face, level1, x1, y0, z1, texel1, 1);
+            get_texel(tgsi_sampler, face, level1, x0, y1, z1, texel1, 2);
+            get_texel(tgsi_sampler, face, level1, x1, y1, z1, texel1, 3);
 
             /* 3D lerp */
             for (c = 0; c < 4; c++) {
@@ -956,38 +977,43 @@ sp_get_samples_cube(struct tgsi_sampler *sampler,
 
 
 static void
-sp_get_samples_rect(struct tgsi_sampler *sampler,
+sp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
                     const float s[QUAD_SIZE],
                     const float t[QUAD_SIZE],
                     const float p[QUAD_SIZE],
                     float lodbias,
                     float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
+   const struct softpipe_context *sp = samp->sp;
+   const uint unit = samp->unit;
+   const struct pipe_texture *texture = sp->texture[unit];
+   const struct pipe_sampler_state *sampler = sp->sampler[unit];
    //sp_get_samples_2d_common(sampler, s, t, p, lodbias, rgba, faces);
    static const uint face = 0;
-   const uint compare_func = sampler->state->compare_func;
+   const uint compare_func = sampler->compare_func;
    unsigned level0, level1, j, imgFilter;
    int width, height;
    float levelBlend;
 
-   choose_mipmap_levels(sampler, s, t, p, lodbias,
+   choose_mipmap_levels(texture, sampler, s, t, p, lodbias,
                         &level0, &level1, &levelBlend, &imgFilter);
 
    /* texture RECTS cannot be mipmapped */
    assert(level0 == level1);
 
-   width = sampler->texture->width[level0];
-   height = sampler->texture->height[level0];
+   width = texture->width[level0];
+   height = texture->height[level0];
 
    assert(width > 0);
 
    switch (imgFilter) {
    case PIPE_TEX_FILTER_NEAREST:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = nearest_texcoord_unnorm(sampler->state->wrap_s, s[j], width);
-         int y = nearest_texcoord_unnorm(sampler->state->wrap_t, t[j], height);
-         get_texel(sampler, face, level0, x, y, 0, rgba, j);
-         if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+         int x = nearest_texcoord_unnorm(sampler->wrap_s, s[j], width);
+         int y = nearest_texcoord_unnorm(sampler->wrap_t, t[j], height);
+         get_texel(tgsi_sampler, face, level0, x, y, 0, rgba, j);
+         if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
             shadow_compare(compare_func, rgba, p, j);
          }
       }
@@ -997,13 +1023,13 @@ sp_get_samples_rect(struct tgsi_sampler *sampler,
       for (j = 0; j < QUAD_SIZE; j++) {
          float tx[4][4], a, b;
          int x0, y0, x1, y1, c;
-         linear_texcoord_unnorm(sampler->state->wrap_s, s[j], width,  &x0, &x1, &a);
-         linear_texcoord_unnorm(sampler->state->wrap_t, t[j], height, &y0, &y1, &b);
-         get_texel(sampler, face, level0, x0, y0, 0, tx, 0);
-         get_texel(sampler, face, level0, x1, y0, 0, tx, 1);
-         get_texel(sampler, face, level0, x0, y1, 0, tx, 2);
-         get_texel(sampler, face, level0, x1, y1, 0, tx, 3);
-         if (sampler->state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+         linear_texcoord_unnorm(sampler->wrap_s, s[j], width,  &x0, &x1, &a);
+         linear_texcoord_unnorm(sampler->wrap_t, t[j], height, &y0, &y1, &b);
+         get_texel(tgsi_sampler, face, level0, x0, y0, 0, tx, 0);
+         get_texel(tgsi_sampler, face, level0, x1, y0, 0, tx, 1);
+         get_texel(tgsi_sampler, face, level0, x0, y1, 0, tx, 2);
+         get_texel(tgsi_sampler, face, level0, x1, y1, 0, tx, 3);
+         if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
             shadow_compare(compare_func, tx, p, 0);
             shadow_compare(compare_func, tx, p, 1);
             shadow_compare(compare_func, tx, p, 2);
@@ -1036,34 +1062,40 @@ sp_get_samples_rect(struct tgsi_sampler *sampler,
  * a new tgsi_sampler object for each state combo it finds....
  */
 void
-sp_get_samples(struct tgsi_sampler *sampler,
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,
                const float s[QUAD_SIZE],
                const float t[QUAD_SIZE],
                const float p[QUAD_SIZE],
                float lodbias,
                float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   if (!sampler->texture)
+   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
+   const struct softpipe_context *sp = samp->sp;
+   const uint unit = samp->unit;
+   const struct pipe_texture *texture = sp->texture[unit];
+   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+
+   if (!texture)
       return;
 
-   switch (sampler->texture->target) {
+   switch (texture->target) {
    case PIPE_TEXTURE_1D:
-      assert(sampler->state->normalized_coords);
-      sp_get_samples_1d(sampler, s, t, p, lodbias, rgba);
+      assert(sampler->normalized_coords);
+      sp_get_samples_1d(tgsi_sampler, s, t, p, lodbias, rgba);
       break;
    case PIPE_TEXTURE_2D:
-      if (sampler->state->normalized_coords)
-         sp_get_samples_2d(sampler, s, t, p, lodbias, rgba);
+      if (sampler->normalized_coords)
+         sp_get_samples_2d(tgsi_sampler, s, t, p, lodbias, rgba);
       else
-         sp_get_samples_rect(sampler, s, t, p, lodbias, rgba);
+         sp_get_samples_rect(tgsi_sampler, s, t, p, lodbias, rgba);
       break;
    case PIPE_TEXTURE_3D:
-      assert(sampler->state->normalized_coords);
-      sp_get_samples_3d(sampler, s, t, p, lodbias, rgba);
+      assert(sampler->normalized_coords);
+      sp_get_samples_3d(tgsi_sampler, s, t, p, lodbias, rgba);
       break;
    case PIPE_TEXTURE_CUBE:
-      assert(sampler->state->normalized_coords);
-      sp_get_samples_cube(sampler, s, t, p, lodbias, rgba);
+      assert(sampler->normalized_coords);
+      sp_get_samples_cube(tgsi_sampler, s, t, p, lodbias, rgba);
       break;
    default:
       assert(0);
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 404bfd0c365..783169c9397 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -1,8 +1,56 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
 #ifndef SP_TEX_SAMPLE_H
 #define SP_TEX_SAMPLE_H
 
 
-struct tgsi_sampler;
+#include "tgsi/tgsi_exec.h"
+
+
+/**
+ * Subclass of tgsi_sampler
+ */
+struct sp_shader_sampler
+{
+   struct tgsi_sampler base;  /**< base class */
+
+   uint unit;
+   struct softpipe_context *sp;
+   struct softpipe_tile_cache *cache;
+};
+
+
+
+static INLINE struct sp_shader_sampler *
+sp_shader_sampler(struct tgsi_sampler *sampler)
+{
+   return (struct sp_shader_sampler *) sampler;
+}
 
 
 extern void
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index b50c9845133..78b0efa46d2 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -494,11 +494,11 @@ tex_cache_pos(int x, int y, int z, int face, int level)
  * Tiles are read-only and indexed with more params.
  */
 const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct pipe_context *pipe,
+sp_get_cached_tile_tex(struct softpipe_context *sp,
                        struct softpipe_tile_cache *tc, int x, int y, int z,
                        int face, int level)
 {
-   struct pipe_screen *screen = pipe->screen;
+   struct pipe_screen *screen = sp->pipe.screen;
    /* tile pos in framebuffer: */
    const int tile_x = x & ~(TILE_SIZE - 1);
    const int tile_y = y & ~(TILE_SIZE - 1);
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index bc96c941f61..a66bb50bcc1 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -96,7 +96,7 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
                    struct softpipe_tile_cache *tc, int x, int y);
 
 extern const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct pipe_context *pipe,
+sp_get_cached_tile_tex(struct softpipe_context *softpipe,
                        struct softpipe_tile_cache *tc, int x, int y, int z,
                        int face, int level);
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 4d64c74a4aa..7bcebd3d6b6 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,10 +144,12 @@ typedef unsigned char boolean;
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
+#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
+#define ALIGN8_ATTRIB
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index af3746c0265..05cbd2fc4df 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -85,8 +85,19 @@
 #define PIPE_ARCH_X86_64
 #endif
 
-#if 0 /* FIXME */
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC) && !defined(__SSE2__)
+/* #warning SSE2 support requires -msse -msse2 compiler options */
+#else
+#define PIPE_ARCH_SSE
+#endif
+#endif
+
+#if defined(__PPC__)
 #define PIPE_ARCH_PPC
+#if defined(__PPC64__)
+#define PIPE_ARCH_PPC_64
+#endif
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index d70de8e3011..5e79b7f485a 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -82,11 +82,14 @@ static INLINE void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    /* bump the refcount first */
-   if (surf) 
+   if (surf) {
+      assert(surf->refcount);
       surf->refcount++;
+   }
 
    if (*ptr) {
-
+      assert((*ptr)->refcount);
+      
       /* There are currently two sorts of surfaces... This needs to be
        * fixed so that all surfaces are views into a texture.
        */
@@ -113,11 +116,16 @@ winsys_buffer_reference(struct pipe_winsys *winsys,
 		      struct pipe_buffer **ptr,
 		      struct pipe_buffer *buf)
 {
-   if (buf) 
+   if (buf) {
+      assert(buf->refcount);
       buf->refcount++;
+   }
 
-   if (*ptr && --(*ptr)->refcount == 0)
-      winsys->buffer_destroy( winsys, *ptr );
+   if (*ptr) {
+      assert((*ptr)->refcount);
+      if(--(*ptr)->refcount == 0)
+         winsys->buffer_destroy( winsys, *ptr );
+   }
 
    *ptr = buf;
 }
@@ -133,12 +141,15 @@ pipe_texture_reference(struct pipe_texture **ptr,
 {
    assert(ptr);
 
-   if (pt) 
+   if (pt) { 
+      assert(pt->refcount);
       pt->refcount++;
+   }
 
    if (*ptr) {
       struct pipe_screen *screen = (*ptr)->screen;
       assert(screen);
+      assert((*ptr)->refcount);
       screen->texture_release(screen, ptr);
 
       assert(!*ptr);
@@ -154,6 +165,7 @@ pipe_texture_release(struct pipe_texture **ptr)
    struct pipe_screen *screen;
    assert(ptr);
    screen = (*ptr)->screen;
+   assert((*ptr)->refcount);
    screen->texture_release(screen, ptr);
    *ptr = NULL;
 }
@@ -176,12 +188,6 @@ pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
    return screen->winsys->user_buffer_create(screen->winsys, ptr, size);
 }
 
-static INLINE void
-pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf )
-{
-   screen->winsys->buffer_destroy(screen->winsys, buf);
-}
-
 static INLINE void *
 pipe_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index b15affef7a5..3bedc752947 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 /**
+ * @file
+ * 
  * Screen, Adapter or GPU
  *
  * These are driver functions/facilities that are context independent.
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index a562e3a6b14..ad43d799f32 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -1,4 +1,31 @@
-#if !defined TGSI_TOKEN_H
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_TOKEN_H
 #define TGSI_TOKEN_H
 
 #ifdef __cplusplus
@@ -36,10 +63,10 @@ struct tgsi_processor
 
 struct tgsi_token
 {
-   unsigned Type       : 4;  /* TGSI_TOKEN_TYPE_ */
-   unsigned Size       : 8;  /* UINT */
+   unsigned Type       : 4;  /**< TGSI_TOKEN_TYPE_x */
+   unsigned Size       : 8;  /**< UINT */
    unsigned Padding    : 19;
-   unsigned Extended   : 1;  /* BOOL */
+   unsigned Extended   : 1;  /**< BOOL */
 };
 
 enum tgsi_file_type {
@@ -79,20 +106,22 @@ enum tgsi_file_type {
 
 struct tgsi_declaration
 {
-   unsigned Type        : 4;  /* TGSI_TOKEN_TYPE_DECLARATION */
-   unsigned Size        : 8;  /* UINT */
-   unsigned File        : 4;  /* one of TGSI_FILE_x */
-   unsigned UsageMask   : 4;  /* bitmask of TGSI_WRITEMASK_x flags */
-   unsigned Interpolate : 4;  /* TGSI_INTERPOLATE_ */
-   unsigned Semantic    : 1;  /* BOOL, any semantic info? */
-   unsigned Padding     : 6;
-   unsigned Extended    : 1;  /* BOOL */
+   unsigned Type        : 4;  /**< TGSI_TOKEN_TYPE_DECLARATION */
+   unsigned Size        : 8;  /**< UINT */
+   unsigned File        : 4;  /**< one of TGSI_FILE_x */
+   unsigned UsageMask   : 4;  /**< bitmask of TGSI_WRITEMASK_x flags */
+   unsigned Interpolate : 4;  /**< one of TGSI_INTERPOLATE_x */
+   unsigned Semantic    : 1;  /**< BOOL, any semantic info? */
+   unsigned Centroid    : 1;  /**< centroid sampling? */
+   unsigned Invariant   : 1;  /**< invariant optimization? */
+   unsigned Padding     : 4;
+   unsigned Extended    : 1;  /**< BOOL */
 };
 
 struct tgsi_declaration_range
 {
-   unsigned First   : 16; /* UINT */
-   unsigned Last    : 16; /* UINT */
+   unsigned First   : 16; /**< UINT */
+   unsigned Last    : 16; /**< UINT */
 };
 
 #define TGSI_SEMANTIC_POSITION 0
@@ -106,8 +135,8 @@ struct tgsi_declaration_range
 
 struct tgsi_declaration_semantic
 {
-   unsigned SemanticName   : 8;  /* one of TGSI_SEMANTIC_ */
-   unsigned SemanticIndex  : 16; /* UINT */
+   unsigned SemanticName   : 8;  /**< one of TGSI_SEMANTIC_x */
+   unsigned SemanticIndex  : 16; /**< UINT */
    unsigned Padding        : 8;
 };
 
@@ -115,11 +144,11 @@ struct tgsi_declaration_semantic
 
 struct tgsi_immediate
 {
-   unsigned Type       : 4;  /* TGSI_TOKEN_TYPE_IMMEDIATE */
-   unsigned Size       : 8;  /* UINT */
-   unsigned DataType   : 4;  /* TGSI_IMM_ */
+   unsigned Type       : 4;  /**< TGSI_TOKEN_TYPE_IMMEDIATE */
+   unsigned Size       : 8;  /**< UINT */
+   unsigned DataType   : 4;  /**< one of TGSI_IMM_x */
    unsigned Padding    : 15;
-   unsigned Extended   : 1;  /* BOOL */
+   unsigned Extended   : 1;  /**< BOOL */
 };
 
 struct tgsi_immediate_float32
@@ -396,7 +425,7 @@ struct tgsi_immediate_float32
 #define TGSI_SAT_ZERO_ONE        1  /* clamp to [0,1] */
 #define TGSI_SAT_MINUS_PLUS_ONE  2  /* clamp to [-1,1] */
 
-/*
+/**
  * Opcode is the operation code to execute. A given operation defines the
  * semantics how the source registers (if any) are interpreted and what is
  * written to the destination registers (if any) as a result of execution.
@@ -481,7 +510,7 @@ struct tgsi_instruction_ext
 #define TGSI_SWIZZLE_Z      2
 #define TGSI_SWIZZLE_W      3
 
-/*
+/**
  * Precision controls the precision at which the operation should be executed.
  *
  * CondDstUpdate enables condition code register writes. When this field is
@@ -548,7 +577,7 @@ struct tgsi_instruction_ext_predicate
    unsigned Extended         : 1;    /* BOOL */
 };
 
-/*
+/**
  * File specifies the register array to access.
  *
  * Index specifies the element number of a register in the register file.
@@ -580,7 +609,7 @@ struct tgsi_src_register
    unsigned Extended    : 1;  /* BOOL */
 };
 
-/*
+/**
  * If tgsi_src_register::Extended is TRUE, tgsi_src_register_ext follows.
  * 
  * Then, if tgsi_src_register::Indirect is TRUE, another tgsi_src_register
@@ -599,7 +628,7 @@ struct tgsi_src_register_ext
    unsigned Extended : 1;    /* BOOL */
 };
 
-/*
+/**
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
  * it should be cast to tgsi_src_register_ext_swz.
  * 
@@ -617,7 +646,7 @@ struct tgsi_src_register_ext
 #define TGSI_EXTSWIZZLE_ZERO    4
 #define TGSI_EXTSWIZZLE_ONE     5
 
-/*
+/**
  * ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source
  * register in an extended manner.
  *
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index da783389dae..342f17260a7 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -27,6 +27,8 @@
 
 
 /**
+ * @file
+ * 
  * Abstract graphics pipe state objects.
  *
  * Basic notes:
diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h
index e01d5a602b8..8af3cd958b0 100644
--- a/src/gallium/include/pipe/p_thread.h
+++ b/src/gallium/include/pipe/p_thread.h
@@ -25,6 +25,8 @@
 
 
 /**
+ * @file
+ * 
  * Thread, mutex, condition var and thread-specific data functions.
  */
 
diff --git a/src/gallium/state_trackers/g3dvl/Makefile b/src/gallium/state_trackers/g3dvl/Makefile
new file mode 100644
index 00000000000..4f7a9534847
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/Makefile
@@ -0,0 +1,18 @@
+TARGET		= libg3dvl.a
+OBJECTS		= vl_display.o vl_screen.o vl_context.o vl_surface.o vl_shader_build.o vl_util.o vl_basic_csc.o	\
+		  vl_r16snorm_mc.o vl_r16snorm_mc_buf.o
+GALLIUMDIR	= ../..
+
+CFLAGS		+= -g -Wall -fPIC -I${GALLIUMDIR}/include -I${GALLIUMDIR}/auxiliary -I${GALLIUMDIR}/winsys/g3dvl
+
+#############################################
+
+.PHONY	= all clean
+
+all: ${TARGET}
+
+${TARGET}: ${OBJECTS}
+	ar rcs $@ $^
+
+clean:
+	rm -rf ${OBJECTS} ${TARGET}
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
new file mode 100644
index 00000000000..9f9dafc8a95
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
@@ -0,0 +1,707 @@
+#define VL_INTERNAL
+#include "vl_basic_csc.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_state.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_csc.h"
+#include "vl_surface.h"
+#include "vl_shader_build.h"
+#include "vl_types.h"
+
+struct vlVertexShaderConsts
+{
+	struct vlVertex4f	dst_scale;
+	struct vlVertex4f	dst_trans;
+	struct vlVertex4f	src_scale;
+	struct vlVertex4f	src_trans;
+};
+
+struct vlFragmentShaderConsts
+{
+	struct vlVertex4f	bias;
+	float			matrix[16];
+};
+
+struct vlBasicCSC
+{
+	struct vlCSC				base;
+
+	struct pipe_context			*pipe;
+	struct pipe_viewport_state		viewport;
+	struct pipe_framebuffer_state		framebuffer;
+	void					*sampler;
+	void					*vertex_shader, *fragment_shader;
+	struct pipe_vertex_buffer 		vertex_bufs[2];
+	struct pipe_vertex_element		vertex_elems[2];
+	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
+};
+
+static int vlResizeFrameBuffer
+(
+	struct vlCSC *csc,
+	unsigned int width,
+	unsigned int height
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	if (basic_csc->framebuffer.width == width && basic_csc->framebuffer.height == height)
+		return 0;
+
+	if (basic_csc->framebuffer.cbufs[0])
+		pipe->winsys->surface_release
+		(
+			pipe->winsys,
+			&basic_csc->framebuffer.cbufs[0]
+		);
+
+	basic_csc->viewport.scale[0] = width;
+	basic_csc->viewport.scale[1] = height;
+	basic_csc->viewport.scale[2] = 1;
+	basic_csc->viewport.scale[3] = 1;
+	basic_csc->viewport.translate[0] = 0;
+	basic_csc->viewport.translate[1] = 0;
+	basic_csc->viewport.translate[2] = 0;
+	basic_csc->viewport.translate[3] = 0;
+
+	basic_csc->framebuffer.width = width;
+	basic_csc->framebuffer.height = height;
+	basic_csc->framebuffer.cbufs[0] = pipe->winsys->surface_alloc(pipe->winsys);
+	pipe->winsys->surface_alloc_storage
+	(
+		pipe->winsys,
+		basic_csc->framebuffer.cbufs[0],
+		width,
+		height,
+		PIPE_FORMAT_A8R8G8B8_UNORM,
+		/* XXX: SoftPipe doesn't change GPU usage to CPU like it does for textures */
+		PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE,
+		0
+	);
+
+	/* Clear to black, in case video doesn't fill the entire window */
+	pipe->clear(pipe, basic_csc->framebuffer.cbufs[0], 0);
+
+	return 0;
+}
+
+static int vlBegin
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer);
+	pipe->set_viewport_state(pipe, &basic_csc->viewport);
+	pipe->bind_sampler_states(pipe, 1, (void**)&basic_csc->sampler);
+	/* Source texture set in vlPutSurface() */
+	pipe->bind_vs_state(pipe, basic_csc->vertex_shader);
+	pipe->bind_fs_state(pipe, basic_csc->fragment_shader);
+	pipe->set_vertex_buffers(pipe, 2, basic_csc->vertex_bufs);
+	pipe->set_vertex_elements(pipe, 2, basic_csc->vertex_elems);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &basic_csc->vs_const_buf);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &basic_csc->fs_const_buf);
+
+	return 0;
+}
+
+static int vlPutPictureCSC
+(
+	struct vlCSC *csc,
+	struct vlSurface *surface,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,
+	int desty,
+	int destw,
+	int desth,
+	enum vlPictureType picture_type
+)
+{
+	struct vlBasicCSC		*basic_csc;
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(csc);
+	assert(surface);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		basic_csc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->dst_scale.x = destw / (float)basic_csc->framebuffer.cbufs[0]->width;
+	vs_consts->dst_scale.y = desth / (float)basic_csc->framebuffer.cbufs[0]->height;
+	vs_consts->dst_scale.z = 1;
+	vs_consts->dst_scale.w = 1;
+	vs_consts->dst_trans.x = destx / (float)basic_csc->framebuffer.cbufs[0]->width;
+	vs_consts->dst_trans.y = desty / (float)basic_csc->framebuffer.cbufs[0]->height;
+	vs_consts->dst_trans.z = 0;
+	vs_consts->dst_trans.w = 0;
+
+	vs_consts->src_scale.x = srcw / (float)surface->texture->width[0];
+	vs_consts->src_scale.y = srch / (float)surface->texture->height[0];
+	vs_consts->src_scale.z = 1;
+	vs_consts->src_scale.w = 1;
+	vs_consts->src_trans.x = srcx / (float)surface->texture->width[0];
+	vs_consts->src_trans.y = srcy / (float)surface->texture->height[0];
+	vs_consts->src_trans.z = 0;
+	vs_consts->src_trans.w = 0;
+
+	pipe->winsys->buffer_unmap(pipe->winsys, basic_csc->vs_const_buf.buffer);
+
+	pipe->set_sampler_textures(pipe, 1, &surface->texture);
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
+
+	return 0;
+}
+
+static int vlEnd
+(
+	struct vlCSC *csc
+)
+{
+	assert(csc);
+
+	return 0;
+}
+
+static struct pipe_surface* vlGetFrameBuffer
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+
+	return basic_csc->framebuffer.cbufs[0];
+}
+
+static int vlDestroy
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+	unsigned int		i;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	if (basic_csc->framebuffer.cbufs[0])
+		pipe->winsys->surface_release
+		(
+			pipe->winsys,
+			&basic_csc->framebuffer.cbufs[0]
+		);
+
+	pipe->delete_sampler_state(pipe, basic_csc->sampler);
+	pipe->delete_vs_state(pipe, basic_csc->vertex_shader);
+	pipe->delete_fs_state(pipe, basic_csc->fragment_shader);
+
+	for (i = 0; i < 2; ++i)
+		pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vertex_bufs[i].buffer);
+
+	pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vs_const_buf.buffer);
+	pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->fs_const_buf.buffer);
+
+	free(basic_csc);
+
+	return 0;
+}
+
+/*
+ * Represents 2 triangles in a strip in normalized coords.
+ * Used to render the surface onto the frame buffer.
+ */
+static const struct vlVertex2f surface_verts[4] =
+{
+	{0.0f, 0.0f},
+	{0.0f, 1.0f},
+	{1.0f, 0.0f},
+	{1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above. We can use the position values directly.
+ * TODO: Duplicate these in the shader, no need to create a buffer.
+ */
+static const struct vlVertex2f *surface_texcoords = surface_verts;
+
+/*
+ * Identity color conversion constants, for debugging
+ */
+static const struct vlFragmentShaderConsts identity =
+{
+	{
+		0.0f, 0.0f, 0.0f, 0.0f
+	},
+	{
+		1.0f, 0.0f, 0.0f, 0.0f,
+		0.0f, 1.0f, 0.0f, 0.0f,
+		0.0f, 0.0f, 1.0f, 0.0f,
+		0.0f, 0.0f, 0.0f, 1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct vlFragmentShaderConsts bt_601 =
+{
+	{
+		0.0f,		0.501960784f,	0.501960784f,	0.0f
+	},
+	{
+		1.0f,		0.0f,		1.371f,		0.0f,
+		1.0f,		-0.336f,	-0.698f,	0.0f,
+		1.0f,		1.732f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const struct vlFragmentShaderConsts bt_601_full =
+{
+	{
+		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
+	},
+	{
+		1.164f,		0.0f,		1.596f,		0.0f,
+		1.164f,		-0.391f,	-0.813f,	0.0f,
+		1.164f,		2.018f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct vlFragmentShaderConsts bt_709 =
+{
+	{
+		0.0f,		0.501960784f,	0.501960784f,	0.0f
+	},
+	{
+		1.0f,		0.0f,		1.540f,		0.0f,
+		1.0f,		-0.183f,	-0.459f,	0.0f,
+		1.0f,		1.816f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+const struct vlFragmentShaderConsts bt_709_full =
+{
+	{
+		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
+	},
+	{
+		1.164f,		0.0f,		1.793f,		0.0f,
+		1.164f,		-0.213f,	-0.534f,	0.0f,
+		1.164f,		2.115f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+static int vlCreateVertexShader
+(
+	struct vlBasicCSC *csc
+)
+{
+	const unsigned int		max_tokens = 50;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(context);
+
+	pipe = csc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Vertex texcoords
+	 */
+	for (i = 0; i < 2; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale vertex pos rect to destination size
+	 * decl c1		; Translation vector to move vertex pos rect into position
+	 * decl c2		; Scaling vector to scale texcoord rect to source size
+	 * decl c3		; Translation vector to move texcoord rect into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Vertex texcoords
+	 */
+	for (i = 0; i < 2; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * madd o0, i0, c0, c1	; Scale and translate unit output rect to destination size and pos
+	 * madd o1, i1, c2, c3	; Scale and translate unit texcoord rect to source size and pos
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst4(TGSI_OPCODE_MADD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	csc->vertex_shader = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShader
+(
+	struct vlBasicCSC *csc
+)
+{
+	const unsigned int		max_tokens = 50;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(context);
+
+	pipe = csc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/* decl i0		; Texcoords for s0 */
+	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl c0		; Bias vector for CSC
+	 * decl c1-c4		; CSC matrix c1-c4
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0		; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl s0		; Sampler for tex containing picture to display */
+	decl = vl_decl_samplers(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* tex2d t0, i0, s0	; Read src pixel */
+	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t0, t0, c0	; Subtract bias vector from pixel */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * dp4 o0.x, t0, c1	; Multiply pixel by the color conversion matrix
+	 * dp4 o0.y, t0, c2
+	 * dp4 o0.z, t0, c3
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	csc->fragment_shader = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateDataBufs
+(
+	struct vlBasicCSC *csc
+)
+{
+	struct pipe_context *pipe;
+
+	assert(csc);
+
+	pipe = csc->pipe;
+
+	/*
+	 * Create our vertex buffer and vertex buffer element
+	 * VB contains 4 vertices that render a quad covering the entire window
+	 * to display a rendered surface
+	 * Quad is rendered as a tri strip
+	 */
+	csc->vertex_bufs[0].pitch = sizeof(struct vlVertex2f);
+	csc->vertex_bufs[0].max_index = 3;
+	csc->vertex_bufs[0].buffer_offset = 0;
+	csc->vertex_bufs[0].buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_VERTEX,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		surface_verts,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[0].buffer);
+
+	csc->vertex_elems[0].src_offset = 0;
+	csc->vertex_elems[0].vertex_buffer_index = 0;
+	csc->vertex_elems[0].nr_components = 2;
+	csc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/*
+	 * Create our texcoord buffer and texcoord buffer element
+	 * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
+	 */
+	csc->vertex_bufs[1].pitch = sizeof(struct vlVertex2f);
+	csc->vertex_bufs[1].max_index = 3;
+	csc->vertex_bufs[1].buffer_offset = 0;
+	csc->vertex_bufs[1].buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_VERTEX,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		surface_texcoords,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[1].buffer);
+
+	csc->vertex_elems[1].src_offset = 0;
+	csc->vertex_elems[1].vertex_buffer_index = 1;
+	csc->vertex_elems[1].nr_components = 2;
+	csc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/*
+	 * Create our vertex shader's constant buffer
+	 * Const buffer contains scaling and translation vectors
+	 */
+	csc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
+	csc->vs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		csc->vs_const_buf.size
+	);
+
+	/*
+	 * Create our fragment shader's constant buffer
+	 * Const buffer contains the color conversion matrix and bias vectors
+	 */
+	csc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
+	csc->fs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		csc->fs_const_buf.size
+	);
+
+	/*
+	 * TODO: Refactor this into a seperate function,
+	 * allow changing the CSC matrix at runtime to switch between regular & full versions
+	 */
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		&bt_601,
+		sizeof(struct vlFragmentShaderConsts)
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->fs_const_buf.buffer);
+
+	return 0;
+}
+
+static int vlInit
+(
+	struct vlBasicCSC *csc
+)
+{
+	struct pipe_context		*pipe;
+	struct pipe_sampler_state	sampler;
+
+	assert(csc);
+
+	pipe = csc->pipe;
+
+	/* Delay creating the FB until vlPutSurface() so we know window size */
+	csc->framebuffer.num_cbufs = 1;
+	csc->framebuffer.cbufs[0] = NULL;
+	csc->framebuffer.zsbuf = NULL;
+
+	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+	sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+	sampler.compare_func = PIPE_FUNC_ALWAYS;
+	sampler.normalized_coords = 1;
+	/*sampler.prefilter = ;*/
+	/*sampler.shadow_ambient = ;*/
+	/*sampler.lod_bias = ;*/
+	/*sampler.min_lod = ;*/
+	/*sampler.max_lod = ;*/
+	/*sampler.border_color[i] = ;*/
+	/*sampler.max_anisotropy = ;*/
+	csc->sampler = pipe->create_sampler_state(pipe, &sampler);
+
+	vlCreateVertexShader(csc);
+	vlCreateFragmentShader(csc);
+	vlCreateDataBufs(csc);
+
+	return 0;
+}
+
+int vlCreateBasicCSC
+(
+	struct pipe_context *pipe,
+	struct vlCSC **csc
+)
+{
+	struct vlBasicCSC *basic_csc;
+
+	assert(pipe);
+	assert(csc);
+
+	basic_csc = calloc(1, sizeof(struct vlBasicCSC));
+
+	if (!basic_csc)
+		return 1;
+
+	basic_csc->base.vlResizeFrameBuffer = &vlResizeFrameBuffer;
+	basic_csc->base.vlBegin = &vlBegin;
+	basic_csc->base.vlPutPicture = &vlPutPictureCSC;
+	basic_csc->base.vlEnd = &vlEnd;
+	basic_csc->base.vlGetFrameBuffer = &vlGetFrameBuffer;
+	basic_csc->base.vlDestroy = &vlDestroy;
+	basic_csc->pipe = pipe;
+
+	vlInit(basic_csc);
+
+	*csc = &basic_csc->base;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.h b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
new file mode 100644
index 00000000000..2e17f1d814a
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
@@ -0,0 +1,13 @@
+#ifndef vl_basic_csc_h
+#define vl_basic_csc_h
+
+struct pipe_context;
+struct vlCSC;
+
+int vlCreateBasicCSC
+(
+	struct pipe_context *pipe,
+	struct vlCSC **csc
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.c b/src/gallium/state_trackers/g3dvl/vl_context.c
new file mode 100644
index 00000000000..fe107e406da
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_context.c
@@ -0,0 +1,210 @@
+#define VL_INTERNAL
+#include "vl_context.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_state.h>
+#include "vl_render.h"
+#include "vl_r16snorm_mc.h"
+#include "vl_r16snorm_mc_buf.h"
+#include "vl_csc.h"
+#include "vl_basic_csc.h"
+
+static int vlInitCommon(struct vlContext *context)
+{
+	struct pipe_context			*pipe;
+	struct pipe_rasterizer_state		rast;
+	struct pipe_blend_state			blend;
+	struct pipe_depth_stencil_alpha_state	dsa;
+	unsigned int				i;
+
+	assert(context);
+
+	pipe = context->pipe;
+
+	rast.flatshade = 1;
+	rast.flatshade_first = 0;
+	rast.light_twoside = 0;
+	rast.front_winding = PIPE_WINDING_CCW;
+	rast.cull_mode = PIPE_WINDING_CW;
+	rast.fill_cw = PIPE_POLYGON_MODE_FILL;
+	rast.fill_ccw = PIPE_POLYGON_MODE_FILL;
+	rast.offset_cw = 0;
+	rast.offset_ccw = 0;
+	rast.scissor = 0;
+	rast.poly_smooth = 0;
+	rast.poly_stipple_enable = 0;
+	rast.point_sprite = 0;
+	rast.point_size_per_vertex = 0;
+	rast.multisample = 0;
+	rast.line_smooth = 0;
+	rast.line_stipple_enable = 0;
+	rast.line_stipple_factor = 0;
+	rast.line_stipple_pattern = 0;
+	rast.line_last_pixel = 0;
+	/* Don't need clipping, but viewport mapping done here */
+	rast.bypass_clipping = 0;
+	rast.bypass_vs = 0;
+	rast.origin_lower_left = 0;
+	rast.line_width = 1;
+	rast.point_smooth = 0;
+	rast.point_size = 1;
+	rast.offset_units = 1;
+	rast.offset_scale = 1;
+	/*rast.sprite_coord_mode[i] = ;*/
+	context->raster = pipe->create_rasterizer_state(pipe, &rast);
+	pipe->bind_rasterizer_state(pipe, context->raster);
+
+	blend.blend_enable = 0;
+	blend.rgb_func = PIPE_BLEND_ADD;
+	blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+	blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
+	blend.alpha_func = PIPE_BLEND_ADD;
+	blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+	blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
+	blend.logicop_enable = 0;
+	blend.logicop_func = PIPE_LOGICOP_CLEAR;
+	/* Needed to allow color writes to FB, even if blending disabled */
+	blend.colormask = PIPE_MASK_RGBA;
+	blend.dither = 0;
+	context->blend = pipe->create_blend_state(pipe, &blend);
+	pipe->bind_blend_state(pipe, context->blend);
+
+	dsa.depth.enabled = 0;
+	dsa.depth.writemask = 0;
+	dsa.depth.func = PIPE_FUNC_ALWAYS;
+	dsa.depth.occlusion_count = 0;
+	for (i = 0; i < 2; ++i)
+	{
+		dsa.stencil[i].enabled = 0;
+		dsa.stencil[i].func = PIPE_FUNC_ALWAYS;
+		dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP;
+		dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP;
+		dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP;
+		dsa.stencil[i].ref_value = 0;
+		dsa.stencil[i].value_mask = 0;
+		dsa.stencil[i].write_mask = 0;
+	}
+	dsa.alpha.enabled = 0;
+	dsa.alpha.func = PIPE_FUNC_ALWAYS;
+	dsa.alpha.ref = 0;
+	context->dsa = pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+	pipe->bind_depth_stencil_alpha_state(pipe, context->dsa);
+
+	return 0;
+}
+
+int vlCreateContext
+(
+	struct vlScreen *screen,
+	struct pipe_context *pipe,
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	enum vlProfile profile,
+	enum vlEntryPoint entry_point,
+	struct vlContext **context
+)
+{
+	struct vlContext *ctx;
+
+	assert(screen);
+	assert(context);
+	assert(pipe);
+
+	ctx = calloc(1, sizeof(struct vlContext));
+
+	if (!ctx)
+		return 1;
+
+	ctx->screen = screen;
+	ctx->pipe = pipe;
+	ctx->picture_width = picture_width;
+	ctx->picture_height = picture_height;
+	ctx->picture_format = picture_format;
+	ctx->profile = profile;
+	ctx->entry_point = entry_point;
+
+	vlInitCommon(ctx);
+
+	/*vlCreateR16SNormMC(pipe, picture_width, picture_height, picture_format, &ctx->render);*/
+	vlCreateR16SNormBufferedMC(pipe, picture_width, picture_height, picture_format, &ctx->render);
+	vlCreateBasicCSC(pipe, &ctx->csc);
+
+	*context = ctx;
+
+	return 0;
+}
+
+int vlDestroyContext
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	/* XXX: Must unbind shaders before we can delete them for some reason */
+	context->pipe->bind_vs_state(context->pipe, NULL);
+	context->pipe->bind_fs_state(context->pipe, NULL);
+
+	context->render->vlDestroy(context->render);
+	context->csc->vlDestroy(context->csc);
+
+	context->pipe->delete_blend_state(context->pipe, context->blend);
+	context->pipe->delete_rasterizer_state(context->pipe, context->raster);
+	context->pipe->delete_depth_stencil_alpha_state(context->pipe, context->dsa);
+
+	free(context);
+
+	return 0;
+}
+
+struct vlScreen* vlContextGetScreen
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->screen;
+}
+
+struct pipe_context* vlGetPipeContext
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->pipe;
+}
+
+unsigned int vlGetPictureWidth
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->picture_width;
+}
+
+unsigned int vlGetPictureHeight
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->picture_height;
+}
+
+enum vlFormat vlGetPictureFormat
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->picture_format;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.h b/src/gallium/state_trackers/g3dvl/vl_context.h
new file mode 100644
index 00000000000..3d14634c44e
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_context.h
@@ -0,0 +1,73 @@
+#ifndef vl_context_h
+#define vl_context_h
+
+#include "vl_types.h"
+
+struct pipe_context;
+
+#ifdef VL_INTERNAL
+struct vlRender;
+struct vlCSC;
+
+struct vlContext
+{
+	struct vlScreen		*screen;
+	struct pipe_context	*pipe;
+	unsigned int		picture_width;
+	unsigned int		picture_height;
+	enum vlFormat		picture_format;
+	enum vlProfile		profile;
+	enum vlEntryPoint	entry_point;
+
+	void			*raster;
+	void			*dsa;
+	void			*blend;
+
+	struct vlRender		*render;
+	struct vlCSC		*csc;
+};
+#endif
+
+int vlCreateContext
+(
+	struct vlScreen *screen,
+	struct pipe_context *pipe,
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	enum vlProfile profile,
+	enum vlEntryPoint entry_point,
+	struct vlContext **context
+);
+
+int vlDestroyContext
+(
+	struct vlContext *context
+);
+
+struct vlScreen* vlContextGetScreen
+(
+	struct vlContext *context
+);
+
+struct pipe_context* vlGetPipeContext
+(
+	struct vlContext *context
+);
+
+unsigned int vlGetPictureWidth
+(
+	struct vlContext *context
+);
+
+unsigned int vlGetPictureHeight
+(
+	struct vlContext *context
+);
+
+enum vlFormat vlGetPictureFormat
+(
+	struct vlContext *context
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_csc.h b/src/gallium/state_trackers/g3dvl/vl_csc.h
new file mode 100644
index 00000000000..36417a27929
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_csc.h
@@ -0,0 +1,53 @@
+#ifndef vl_csc_h
+#define vl_csc_h
+
+#include "vl_types.h"
+
+struct pipe_surface;
+
+struct vlCSC
+{
+	int (*vlResizeFrameBuffer)
+	(
+		struct vlCSC *csc,
+		unsigned int width,
+		unsigned int height
+	);
+
+	int (*vlBegin)
+	(
+		struct vlCSC *csc
+	);
+
+	int (*vlPutPicture)
+	(
+		struct vlCSC *csc,
+		struct vlSurface *surface,
+		int srcx,
+		int srcy,
+		int srcw,
+		int srch,
+		int destx,
+		int desty,
+		int destw,
+		int desth,
+		enum vlPictureType picture_type
+	);
+
+	int (*vlEnd)
+	(
+		struct vlCSC *csc
+	);
+
+	struct pipe_surface* (*vlGetFrameBuffer)
+	(
+		struct vlCSC *csc
+	);
+
+	int (*vlDestroy)
+	(
+		struct vlCSC *csc
+	);
+};
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_defs.h b/src/gallium/state_trackers/g3dvl/vl_defs.h
new file mode 100644
index 00000000000..d612d02502f
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_defs.h
@@ -0,0 +1,11 @@
+#ifndef vl_defs_h
+#define vl_defs_h
+
+#define VL_BLOCK_WIDTH		8
+#define VL_BLOCK_HEIGHT		8
+#define VL_BLOCK_SIZE		(VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT)
+#define VL_MACROBLOCK_WIDTH	16
+#define VL_MACROBLOCK_HEIGHT	16
+#define VL_MACROBLOCK_SIZE	(VL_MACROBLOCK_WIDTH * VL_MACROBLOCK_HEIGHT)
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.c b/src/gallium/state_trackers/g3dvl/vl_display.c
new file mode 100644
index 00000000000..af80faa7f52
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_display.c
@@ -0,0 +1,48 @@
+#define VL_INTERNAL
+#include "vl_display.h"
+#include <assert.h>
+#include <stdlib.h>
+
+int vlCreateDisplay
+(
+	vlNativeDisplay native_display,
+	struct vlDisplay **display
+)
+{
+	struct vlDisplay *dpy;
+
+	assert(native_display);
+	assert(display);
+
+	dpy = calloc(1, sizeof(struct vlDisplay));
+
+	if (!dpy)
+		return 1;
+
+	dpy->native = native_display;
+	*display = dpy;
+
+	return 0;
+}
+
+int vlDestroyDisplay
+(
+	struct vlDisplay *display
+)
+{
+	assert(display);
+
+	free(display);
+
+	return 0;
+}
+
+vlNativeDisplay vlGetNativeDisplay
+(
+	struct vlDisplay *display
+)
+{
+	assert(display);
+
+	return display->native;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.h b/src/gallium/state_trackers/g3dvl/vl_display.h
new file mode 100644
index 00000000000..e11fd407993
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_display.h
@@ -0,0 +1,29 @@
+#ifndef vl_display_h
+#define vl_display_h
+
+#include "vl_types.h"
+
+#ifdef VL_INTERNAL
+struct vlDisplay
+{
+	vlNativeDisplay native;
+};
+#endif
+
+int vlCreateDisplay
+(
+	vlNativeDisplay native_display,
+	struct vlDisplay **display
+);
+
+int vlDestroyDisplay
+(
+	struct vlDisplay *display
+);
+
+vlNativeDisplay vlGetNativeDisplay
+(
+	struct vlDisplay *display
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c
new file mode 100644
index 00000000000..3272220ef83
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c
@@ -0,0 +1,2344 @@
+#define VL_INTERNAL
+#include "vl_r16snorm_mc.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_render.h"
+#include "vl_shader_build.h"
+#include "vl_surface.h"
+#include "vl_util.h"
+#include "vl_types.h"
+#include "vl_defs.h"
+
+#define NUM_BUFS 4	/* Number of rotating buffers to use */
+
+struct vlVertexShaderConsts
+{
+	/*struct vlVertex4f scale;
+	struct vlVertex4f denorm;*/
+	struct vlVertex4f	scale;
+	struct vlVertex4f	mb_pos_trans;
+	struct vlVertex4f	denorm;
+	struct
+	{
+		struct vlVertex4f	top_field;
+		struct vlVertex4f	bottom_field;
+	} mb_tc_trans[2];
+};
+
+struct vlFragmentShaderConsts
+{
+	struct vlVertex4f multiplier;
+	struct vlVertex4f div;
+};
+
+struct vlR16SnormMC
+{
+	struct vlRender				base;
+
+	unsigned int				video_width, video_height;
+	enum vlFormat				video_format;
+	unsigned int				cur_buf;
+
+	struct pipe_context			*pipe;
+	struct pipe_viewport_state		viewport;
+	struct pipe_framebuffer_state		render_target;
+	struct pipe_sampler_state		*samplers[5];
+	struct pipe_texture			*textures[NUM_BUFS][5];
+	void					*i_vs, *p_vs[2], *b_vs[2];
+	void					*i_fs, *p_fs[2], *b_fs[2];
+	struct pipe_vertex_buffer 		vertex_bufs[3];
+	struct pipe_vertex_element		vertex_elems[3];
+	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
+};
+
+static int vlBegin
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormMC	*mc;
+	struct pipe_context	*pipe;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;
+	pipe = mc->pipe;
+
+	/* Frame buffer set in vlRender*Macroblock() */
+	/* Shaders, samplers, textures set in vlRender*Macroblock() */
+	pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs);
+	pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
+	pipe->set_viewport_state(pipe, &mc->viewport);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf);
+
+	return 0;
+}
+
+/*static int vlGrabMacroBlock
+(
+	struct vlR16SnormMC *mc,
+	struct vlMpeg2MacroBlock *macroblock
+)
+{
+	assert(mc);
+	assert(macroblock);
+
+
+
+	return 0;
+}*/
+
+/*#define DO_IDCT*/
+
+#ifdef DO_IDCT
+static int vlTransformBlock(short *src, short *dst, short bias)
+{
+	static const float basis[8][8] =
+	{
+		{0.3536,   0.4904,   0.4619,   0.4157,   0.3536,   0.2778,   0.1913,   0.0975},
+		{0.3536,   0.4157,   0.1913,  -0.0975,  -0.3536,  -0.4904,  -0.4619,  -0.2778},
+		{0.3536,   0.2778,  -0.1913,  -0.4904,  -0.3536,   0.0975,   0.4619,   0.4157},
+		{0.3536,   0.0975,  -0.4619,  -0.2778,   0.3536,   0.4157,  -0.1913,  -0.4904},
+		{0.3536,  -0.0975,  -0.4619,   0.2778,   0.3536,  -0.4157,  -0.1913,   0.4904},
+		{0.3536,  -0.2778,  -0.1913,   0.4904,  -0.3536,  -0.0975,   0.4619,  -0.4157},
+		{0.3536,  -0.4157,   0.1913,   0.0975,  -0.3536,   0.4904,  -0.4619,   0.2778},
+		{0.3536,  -0.4904,   0.4619,  -0.4157,   0.3536,  -0.2778,   0.1913,  -0.0975}
+	};
+
+	unsigned int	x, y;
+	short		tmp[64];
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		for (x = 0; x < VL_BLOCK_WIDTH; ++x)
+			tmp[y * VL_BLOCK_WIDTH + x] = (short)
+			(
+				src[y * VL_BLOCK_WIDTH + 0] * basis[x][0] +
+				src[y * VL_BLOCK_WIDTH + 1] * basis[x][1] +
+				src[y * VL_BLOCK_WIDTH + 2] * basis[x][2] +
+				src[y * VL_BLOCK_WIDTH + 3] * basis[x][3] +
+				src[y * VL_BLOCK_WIDTH + 4] * basis[x][4] +
+				src[y * VL_BLOCK_WIDTH + 5] * basis[x][5] +
+				src[y * VL_BLOCK_WIDTH + 6] * basis[x][6] +
+				src[y * VL_BLOCK_WIDTH + 7] * basis[x][7]
+			);
+
+	for (x = 0; x < VL_BLOCK_WIDTH; ++x)
+		for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		{
+			dst[y * VL_BLOCK_WIDTH + x] = bias + (short)
+			(
+				tmp[0 * VL_BLOCK_WIDTH + x] * basis[y][0] +
+				tmp[1 * VL_BLOCK_WIDTH + x] * basis[y][1] +
+				tmp[2 * VL_BLOCK_WIDTH + x] * basis[y][2] +
+				tmp[3 * VL_BLOCK_WIDTH + x] * basis[y][3] +
+				tmp[4 * VL_BLOCK_WIDTH + x] * basis[y][4] +
+				tmp[5 * VL_BLOCK_WIDTH + x] * basis[y][5] +
+				tmp[6 * VL_BLOCK_WIDTH + x] * basis[y][6] +
+				tmp[7 * VL_BLOCK_WIDTH + x] * basis[y][7]
+			);
+			if (dst[y * VL_BLOCK_WIDTH + x] > 255)
+				dst[y * VL_BLOCK_WIDTH + x] = 255;
+			else if (bias > 0 && dst[y * VL_BLOCK_WIDTH + x] < 0)
+				dst[y * VL_BLOCK_WIDTH + x] = 0;
+		}
+	return 0;
+}
+#endif
+
+static int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch * 2,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	dst += VL_BLOCK_HEIGHT * dst_pitch;
+
+	for (; y < VL_BLOCK_HEIGHT; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch * 2,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static int vlGrabNoBlock(short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memset
+		(
+			dst + y * dst_pitch,
+			0,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+enum vlSampleType
+{
+	vlSampleTypeFull,
+	vlSampleTypeDiff
+};
+
+static int vlGrabBlocks
+(
+	struct vlR16SnormMC *mc,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	enum vlSampleType sample_type,
+	short *blocks
+)
+{
+	struct pipe_surface	*tex_surface;
+	short			*texels;
+	unsigned int		tex_pitch;
+	unsigned int		tb, sb = 0;
+
+	assert(mc);
+	assert(blocks);
+
+	tex_surface = mc->pipe->screen->get_tex_surface
+	(
+		mc->pipe->screen,
+		mc->textures[mc->cur_buf % NUM_BUFS][0],
+		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+	tex_pitch = tex_surface->stride / tex_surface->block.size;
+
+	for (tb = 0; tb < 4; ++tb)
+	{
+		if ((coded_block_pattern >> (5 - tb)) & 1)
+		{
+			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+#ifdef DO_IDCT
+			vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 128 : 0);
+#endif
+
+			if (dct_type == vlDCTTypeFrameCoded)
+				vlGrabFrameCodedBlock
+				(
+					cur_block,
+					texels + tb * tex_pitch * VL_BLOCK_HEIGHT,
+					tex_pitch
+				);
+			else
+				vlGrabFieldCodedBlock
+				(
+					cur_block,
+					texels + (tb % 2) * tex_pitch * VL_BLOCK_HEIGHT + (tb / 2) * tex_pitch,
+					tex_pitch
+				);
+
+			++sb;
+		}
+		else
+			vlGrabNoBlock(texels + tb * tex_pitch * VL_BLOCK_HEIGHT, tex_pitch);
+	}
+
+	pipe_surface_unmap(tex_surface);
+
+	/* TODO: Implement 422, 444 */
+	for (tb = 0; tb < 2; ++tb)
+	{
+		tex_surface = mc->pipe->screen->get_tex_surface
+		(
+			mc->pipe->screen,
+			mc->textures[mc->cur_buf % NUM_BUFS][tb + 1],
+			0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+		);
+
+		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+		tex_pitch = tex_surface->stride / tex_surface->block.size;
+
+		if ((coded_block_pattern >> (1 - tb)) & 1)
+		{
+			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+#ifdef DO_IDCT
+			vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 128 : 0);
+#endif
+
+			vlGrabFrameCodedBlock
+			(
+				cur_block,
+				texels,
+				tex_pitch
+			);
+
+			++sb;
+		}
+		else
+			vlGrabNoBlock(texels, tex_pitch);
+
+		pipe_surface_unmap(tex_surface);
+	}
+
+	return 0;
+}
+
+static int vlRenderIMacroBlock
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,
+	unsigned int mbx,
+	unsigned int mby,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(blocks);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeFull, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+	pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUFS]);
+	pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers);
+	pipe->bind_vs_state(pipe, mc->i_vs);
+	pipe->bind_fs_state(pipe, mc->i_fs);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
+
+	mc->cur_buf++;
+
+	return 0;
+}
+
+static int vlRenderPMacroBlock
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,
+	unsigned int mbx,
+	unsigned int mby,
+	enum vlMotionType mc_type,
+	short top_x,
+	short top_y,
+	short bottom_x,
+	short bottom_y,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *ref_surface,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(motion_vectors);
+	assert(blocks);
+	assert(ref_surface);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;
+	/* TODO: Implement other MC types */
+	if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField)
+		return 0;
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_x * 0.5f) / (float)surface->texture->width[0];
+	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;
+
+	if (mc_type == vlMotionTypeField)
+	{
+		vs_consts->denorm.x = (float)surface->texture->width[0];
+		vs_consts->denorm.y = (float)surface->texture->height[0];
+
+		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
+
+		pipe->bind_vs_state(pipe, mc->p_vs[1]);
+		pipe->bind_fs_state(pipe, mc->p_fs[1]);
+	}
+	else
+	{
+		pipe->bind_vs_state(pipe, mc->p_vs[0]);
+		pipe->bind_fs_state(pipe, mc->p_fs[0]);
+	}
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+
+	mc->textures[mc->cur_buf % NUM_BUFS][3] = ref_surface->texture;
+	pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUFS]);
+	pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
+
+	mc->cur_buf++;
+
+	return 0;
+}
+
+static int vlRenderBMacroBlock
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,
+	unsigned int mbx,
+	unsigned int mby,
+	enum vlMotionType mc_type,
+	short top_past_x,
+	short top_past_y,
+	short bottom_past_x,
+	short bottom_past_y,
+	short top_future_x,
+	short top_future_y,
+	short bottom_future_x,
+	short bottom_future_y,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *past_surface,
+	struct vlSurface *future_surface,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(motion_vectors);
+	assert(blocks);
+	assert(ref_surface);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;
+	/* TODO: Implement other MC types */
+	if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField)
+		return 0;
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_past_x * 0.5f) / (float)surface->texture->width[0];
+	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_past_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;
+	vs_consts->mb_tc_trans[1].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_future_x * 0.5f) / (float)surface->texture->width[0];
+	vs_consts->mb_tc_trans[1].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_future_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[1].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[1].top_field.w = 0.0f;
+
+	if (mc_type == vlMotionTypeField)
+	{
+		vs_consts->denorm.x = (float)surface->texture->width[0];
+		vs_consts->denorm.y = (float)surface->texture->height[0];
+
+		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_past_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_past_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
+		vs_consts->mb_tc_trans[1].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_future_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[1].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_future_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[1].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[1].bottom_field.w = 0.0f;
+
+		pipe->bind_vs_state(pipe, mc->b_vs[1]);
+		pipe->bind_fs_state(pipe, mc->b_fs[1]);
+	}
+	else
+	{
+		pipe->bind_vs_state(pipe, mc->b_vs[0]);
+		pipe->bind_fs_state(pipe, mc->b_fs[0]);
+	}
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+
+	mc->textures[mc->cur_buf % NUM_BUFS][3] = past_surface->texture;
+	mc->textures[mc->cur_buf % NUM_BUFS][4] = future_surface->texture;
+	pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUFS]);
+	pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
+
+	mc->cur_buf++;
+
+	return 0;
+}
+
+static int vlRenderMacroBlocksMpeg2R16Snorm
+(
+	struct vlRender *render,
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
+)
+{
+	struct vlR16SnormMC	*mc;
+	unsigned int		i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;
+
+	/*for (i = 0; i < batch->num_macroblocks; ++i)
+		vlGrabMacroBlock(batch->macroblocks[i]);*/
+
+	for (i = 0; i < batch->num_macroblocks; ++i)
+	{
+		switch (batch->macroblocks[i].mb_type)
+		{
+			case vlMacroBlockTypeIntra:
+			{
+				vlRenderIMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeFwdPredicted:
+			{
+				vlRenderPMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][0][0],
+					batch->macroblocks[i].PMV[0][0][1],
+					batch->macroblocks[i].PMV[1][0][0],
+					batch->macroblocks[i].PMV[1][0][1],
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->past_surface,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeBkwdPredicted:
+			{
+				vlRenderPMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][1][0],
+					batch->macroblocks[i].PMV[0][1][1],
+					batch->macroblocks[i].PMV[1][1][0],
+					batch->macroblocks[i].PMV[1][1][1],
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->future_surface,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeBiPredicted:
+			{
+				vlRenderBMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][0][0],
+					batch->macroblocks[i].PMV[0][0][1],
+					batch->macroblocks[i].PMV[1][0][0],
+					batch->macroblocks[i].PMV[1][0][1],
+					batch->macroblocks[i].PMV[0][1][0],
+					batch->macroblocks[i].PMV[0][1][1],
+					batch->macroblocks[i].PMV[1][1][0],
+					batch->macroblocks[i].PMV[1][1][1],
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->past_surface,
+					batch->future_surface,
+					surface
+				);
+				break;
+			}
+			default:
+				assert(0);
+		}
+	}
+
+	return 0;
+}
+
+static int vlEnd
+(
+	struct vlRender *render
+)
+{
+	assert(render);
+
+	return 0;
+}
+
+static int vlFlush
+(
+	struct vlRender *render
+)
+{
+	assert(render);
+
+	return 0;
+}
+
+static int vlDestroy
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormMC	*mc;
+	struct pipe_context	*pipe;
+	unsigned int		i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;
+	pipe = mc->pipe;
+
+	for (i = 0; i < 5; ++i)
+		pipe->delete_sampler_state(pipe, mc->samplers[i]);
+
+	for (i = 0; i < 3; ++i)
+		pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[i].buffer);
+
+	/* Textures 3 & 4 are not created directly, no need to release them here */
+	for (i = 0; i < NUM_BUFS; ++i)
+	{
+		pipe_texture_release(&mc->textures[i][0]);
+		pipe_texture_release(&mc->textures[i][1]);
+		pipe_texture_release(&mc->textures[i][2]);
+	}
+
+	pipe->delete_vs_state(pipe, mc->i_vs);
+	pipe->delete_fs_state(pipe, mc->i_fs);
+
+	for (i = 0; i < 2; ++i)
+	{
+		pipe->delete_vs_state(pipe, mc->p_vs[i]);
+		pipe->delete_fs_state(pipe, mc->p_fs[i]);
+		pipe->delete_vs_state(pipe, mc->b_vs[i]);
+		pipe->delete_fs_state(pipe, mc->b_fs[i]);
+	}
+
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer);
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer);
+
+	free(mc);
+
+	return 0;
+}
+
+/*
+ * Represents 8 triangles (4 quads, 1 per block) in noormalized coords
+ * that render a macroblock.
+ * Need to be scaled to cover mbW*mbH macroblock pixels and translated into
+ * position on target surface.
+ */
+static const struct vlVertex2f macroblock_verts[24] =
+{
+	{0.0f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.0f},
+	{0.5f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.5f},
+
+	{0.5f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.0f},
+	{1.0f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.5f},
+
+	{0.0f, 0.5f}, {0.0f, 1.0f}, {0.5f, 0.5f},
+	{0.5f, 0.5f}, {0.0f, 1.0f}, {0.5f, 1.0f},
+
+	{0.5f, 0.5f}, {0.5f, 1.0f}, {1.0f, 0.5f},
+	{1.0f, 0.5f}, {0.5f, 1.0f}, {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above for rendering 4 luma blocks arranged
+ * in a bW*(bH*4) texture. First luma block located at 0,0->bW,bH; second at
+ * 0,bH->bW,2bH; third at 0,2bH->bW,3bH; fourth at 0,3bH->bW,4bH.
+ */
+static const struct vlVertex2f macroblock_luma_texcoords[24] =
+{
+	{0.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.0f},
+	{1.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.25f},
+
+	{0.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.25f},
+	{1.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.5f},
+
+	{0.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.5f},
+	{1.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.75f},
+
+	{0.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 0.75f},
+	{1.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above for rendering 1 chroma block.
+ * Straight forward 0,0->1,1 mapping so we can reuse the MB pos vectors.
+ */
+static const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts;
+
+/*
+ * Represents texcoords for the above for rendering 2 chroma blocks arranged
+ * in a bW*(bH*2) texture. First chroma block located at 0,0->bW,bH; second at
+ * 0,bH->bW,2bH. We can render this with 0,0->1,1 mapping.
+ * Straight forward 0,0->1,1 mapping so we can reuse MB pos vectors.
+ */
+static const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts;
+
+/*
+ * Represents texcoords for the above for rendering 4 chroma blocks.
+ * Same case as 4 luma blocks.
+ */
+static const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords;
+
+/*
+ * Used when rendering P and B macroblocks, multiplier is applied to the A channel,
+ * which is then added to the L channel, then the bias is subtracted from that to
+ * get back the differential. The differential is then added to the samples from the
+ * reference surface(s).
+ */
+static const struct vlFragmentShaderConsts fs_consts =
+{
+	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
+	{0.5f, 2.0f, 0.0f, 0.0f}
+};
+
+static int vlCreateVertexShaderIMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 50;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->i_vs = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderIMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul o0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->i_fs = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFramePMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Unused
+	 * decl c3		; Translation vector to move ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Ref macroblock texcoords
+	 */
+	for (i = 0; i < 4; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o3, t0, c3	; Translate rect into position on ref macroblock */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFieldPMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration
+		(
+			&decl,
+			&tokens[ti],
+			header,
+			max_tokens - ti
+		);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Denorm coefficients
+	 * decl c3		; Translation vector to move top field ref macroblock texcoords into position
+	 * decl c4		; Translation vector to move bottom field ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Top field ref macroblock texcoords
+	 * decl o4		; Bottom field ref macroblock texcoords
+	 * decl o5		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 6; i++)
+	{
+		decl = vl_decl_output((i == 0 || i == 5) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add t1, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mov o0, t1		; Move vertex pos to output */
+	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	mov o1, i1		; Move input luma texcoords to output
+	mov o2, i2		; Move input chroma texcoords to output
+	*/
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o3, t0, c3	; Translate top field rect into position on ref macroblock
+	   add o4, t0, c4	; Translate bottom field rect into position on ref macroblock */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o5, t1, c2	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[1] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFramePMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* tex2d t1, i2, s3		; Read texel from ref macroblock */
+	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 2, TGSI_FILE_SAMPLER, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[0] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFieldPMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s3
+	 * decl i4			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t4 */
+	decl = vl_decl_temps(0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from ref macroblock top field
+	 * tex2d t2, i3, s3		; Read texel from ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i4.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFrameBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Unused
+	 * decl c3		; Translation vector to move past ref macroblock texcoords into position
+	 * decl c4		; Unused
+	 * decl c5		; Translation vector to move future ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 5);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Past ref macroblock texcoords
+	 * decl o4		; Future ref macroblock texcoords
+	 */
+	for (i = 0; i < 5; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o3, t0, c3	; Translate rect into position on past ref macroblock
+	   add o4, t0, c5	; Translate rect into position on future ref macroblock */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i * 2 + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[0] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFieldBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Denorm coefficients
+	 * decl c3		; Translation vector to move top field past ref macroblock texcoords into position
+	 * decl c4		; Translation vector to move bottom field past ref macroblock texcoords into position
+	 * decl c5		; Translation vector to move top field future ref macroblock texcoords into position
+	 * decl c6		; Translation vector to move bottom field future ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Top field past ref macroblock texcoords
+	 * decl o4		; Bottom field past ref macroblock texcoords
+	 * decl o5		; Top field future ref macroblock texcoords
+	 * decl o6		; Bottom field future ref macroblock texcoords
+	 * decl o7		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 8; i++)
+	{
+		decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add t1, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mov o0, t1		; Move vertex pos to output */
+	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o3, t0, c3	; Translate top field rect into position on past ref macroblock
+	 * add o4, t0, c4	; Translate bottom field rect into position on past ref macroblock
+	 * add o5, t0, c5	; Translate top field rect into position on future ref macroblock
+	 * add o6, t0, c6	; Translate bottom field rect into position on future ref macroblock
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o7, t1, c2	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 7, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[1] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFrameBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s4
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t2 */
+	decl = vl_decl_temps(0, 2);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for past ref surface texture
+	 * decl s4			; Sampler for future ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from past ref macroblock
+	 * tex2d t2, i3, s4		; Read texel from future ref macroblock
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[0] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFieldBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s3
+	 * decl i4			; Texcoords for s4
+	 * decl i5			; Texcoords for s4
+	 * decl i6			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 7; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
+	 *				; and for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t5 */
+	decl = vl_decl_temps(0, 5);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for past ref surface texture
+	 * decl s4			; Sampler for future ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i6.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 6, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from past ref macroblock top field
+	 * tex2d t2, i3, s3		; Read texel from past ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t4, i4, s4		; Read texel from future ref macroblock top field
+	 * tex2d t5, i5, s4		; Read texel from future ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 4, TGSI_FILE_SAMPLER, 4);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateDataBufs
+(
+	struct vlR16SnormMC *mc
+)
+{
+	struct pipe_context	*pipe;
+	unsigned int		i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	/* Create our vertex buffer and vertex buffer element */
+	mc->vertex_bufs[0].pitch = sizeof(struct vlVertex2f);
+	mc->vertex_bufs[0].max_index = 23;
+	mc->vertex_bufs[0].buffer_offset = 0;
+	mc->vertex_bufs[0].buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_VERTEX,
+		sizeof(struct vlVertex2f) * 24
+	);
+
+	mc->vertex_elems[0].src_offset = 0;
+	mc->vertex_elems[0].vertex_buffer_index = 0;
+	mc->vertex_elems[0].nr_components = 2;
+	mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Create our texcoord buffers and texcoord buffer elements */
+	for (i = 1; i < 3; ++i)
+	{
+		mc->vertex_bufs[i].pitch = sizeof(struct vlVertex2f);
+		mc->vertex_bufs[i].max_index = 23;
+		mc->vertex_bufs[i].buffer_offset = 0;
+		mc->vertex_bufs[i].buffer = pipe->winsys->buffer_create
+		(
+			pipe->winsys,
+			1,
+			PIPE_BUFFER_USAGE_VERTEX,
+			sizeof(struct vlVertex2f) * 24
+		);
+
+		mc->vertex_elems[i].src_offset = 0;
+		mc->vertex_elems[i].vertex_buffer_index = i;
+		mc->vertex_elems[i].nr_components = 2;
+		mc->vertex_elems[i].src_format = PIPE_FORMAT_R32G32_FLOAT;
+	}
+
+	/* Fill buffers */
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		macroblock_verts,
+		sizeof(struct vlVertex2f) * 24
+	);
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		macroblock_luma_texcoords,
+		sizeof(struct vlVertex2f) * 24
+	);
+	/* TODO: Accomodate 422, 444 */
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[2].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		macroblock_chroma_420_texcoords,
+		sizeof(struct vlVertex2f) * 24
+	);
+
+	for (i = 0; i < 3; ++i)
+		pipe->winsys->buffer_unmap(pipe->winsys, mc->vertex_bufs[i].buffer);
+
+	/* Create our constant buffer */
+	mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
+	mc->vs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->vs_const_buf.size
+	);
+
+	mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
+	mc->fs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->fs_const_buf.size
+	);
+
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		&fs_consts,
+		sizeof(struct vlFragmentShaderConsts)
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer);
+
+	return 0;
+}
+
+static int vlInit
+(
+	struct vlR16SnormMC *mc
+)
+{
+	struct pipe_context		*pipe;
+	struct pipe_sampler_state	sampler;
+	struct pipe_texture		template;
+	unsigned int			filters[5];
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	/* For MC we render to textures, which are rounded up to nearest POT */
+	mc->viewport.scale[0] = vlRoundUpPOT(mc->video_width);
+	mc->viewport.scale[1] = vlRoundUpPOT(mc->video_height);
+	mc->viewport.scale[2] = 1;
+	mc->viewport.scale[3] = 1;
+	mc->viewport.translate[0] = 0;
+	mc->viewport.translate[1] = 0;
+	mc->viewport.translate[2] = 0;
+	mc->viewport.translate[3] = 0;
+
+	mc->render_target.width = vlRoundUpPOT(mc->video_width);
+	mc->render_target.height = vlRoundUpPOT(mc->video_height);
+	mc->render_target.num_cbufs = 1;
+	/* FB for MC stage is a vlSurface, set in vlSetRenderSurface() */
+	mc->render_target.zsbuf = NULL;
+
+	filters[0] = PIPE_TEX_FILTER_NEAREST;
+	filters[1] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+	filters[2] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+	filters[3] = PIPE_TEX_FILTER_LINEAR;
+	filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+	for (i = 0; i < 5; ++i)
+	{
+		sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.min_img_filter = filters[i];
+		sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+		sampler.mag_img_filter = filters[i];
+		sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+		sampler.compare_func = PIPE_FUNC_ALWAYS;
+		sampler.normalized_coords = 1;
+		/*sampler.prefilter = ;*/
+		/*sampler.shadow_ambient = ;*/
+		/*sampler.lod_bias = ;*/
+		sampler.min_lod = 0;
+		/*sampler.max_lod = ;*/
+		/*sampler.border_color[i] = ;*/
+		/*sampler.max_anisotropy = ;*/
+		mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler);
+	}
+
+	memset(&template, 0, sizeof(struct pipe_texture));
+	template.target = PIPE_TEXTURE_2D;
+	template.format = PIPE_FORMAT_R16_SNORM;
+	template.last_level = 0;
+	template.width[0] = 8;
+	template.height[0] = 8 * 4;
+	template.depth[0] = 1;
+	template.compressed = 0;
+	pf_get_block(template.format, &template.block);
+
+	for (i = 0; i < NUM_BUFS; ++i)
+		mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template);
+
+	if (mc->video_format == vlFormatYCbCr420)
+		template.height[0] = 8;
+	else if (mc->video_format == vlFormatYCbCr422)
+		template.height[0] = 8 * 2;
+	else if (mc->video_format == vlFormatYCbCr444)
+		template.height[0] = 8 * 4;
+	else
+		assert(0);
+
+	for (i = 0; i < NUM_BUFS; ++i)
+	{
+		mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template);
+		mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template);
+	}
+
+	/* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */
+
+	vlCreateVertexShaderIMB(mc);
+	vlCreateFragmentShaderIMB(mc);
+	vlCreateVertexShaderFramePMB(mc);
+	vlCreateVertexShaderFieldPMB(mc);
+	vlCreateFragmentShaderFramePMB(mc);
+	vlCreateFragmentShaderFieldPMB(mc);
+	vlCreateVertexShaderFrameBMB(mc);
+	vlCreateVertexShaderFieldBMB(mc);
+	vlCreateFragmentShaderFrameBMB(mc);
+	vlCreateFragmentShaderFieldBMB(mc);
+	vlCreateDataBufs(mc);
+
+	return 0;
+}
+
+int vlCreateR16SNormMC
+(
+	struct pipe_context *pipe,
+	unsigned int video_width,
+	unsigned int video_height,
+	enum vlFormat video_format,
+	struct vlRender **render
+)
+{
+	struct vlR16SnormMC *mc;
+
+	assert(pipe);
+	assert(render);
+
+	mc = calloc(1, sizeof(struct vlR16SnormMC));
+
+	mc->base.vlBegin = &vlBegin;
+	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16Snorm;
+	mc->base.vlEnd = &vlEnd;
+	mc->base.vlFlush = &vlFlush;
+	mc->base.vlDestroy = &vlDestroy;
+	mc->pipe = pipe;
+	mc->video_width = video_width;
+	mc->video_height = video_height;
+	mc->cur_buf = 0;
+
+	vlInit(mc);
+
+	*render = &mc->base;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h
new file mode 100644
index 00000000000..9842926bf70
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h
@@ -0,0 +1,18 @@
+#ifndef vl_r16snorm_mc_h
+#define vl_r16snorm_mc_h
+
+#include "vl_types.h"
+
+struct pipe_context;
+struct vlRender;
+
+int vlCreateR16SNormMC
+(
+	struct pipe_context *pipe,
+	unsigned int video_width,
+	unsigned int video_height,
+	enum vlFormat video_format,
+	struct vlRender **render
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c
new file mode 100644
index 00000000000..e7a070ef4df
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c
@@ -0,0 +1,2321 @@
+#define VL_INTERNAL
+#include "vl_r16snorm_mc_buf.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_screen.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include <util/u_math.h>
+#include "vl_render.h"
+#include "vl_shader_build.h"
+#include "vl_surface.h"
+#include "vl_util.h"
+#include "vl_types.h"
+#include "vl_defs.h"
+
+/*
+ * TODO: Dynamically determine number of buf sets to use, based on
+ * video size and available mem, since we can easily run out of memory
+ * for high res videos.
+ * Note: Destroying previous frame's buffers and creating new ones
+ * doesn't work, since the buffer are not actually destroyed until their
+ * fence is signalled, and if we render fast enough we will create faster
+ * than we destroy.
+ */
+#define NUM_BUF_SETS 4	/* Number of rotating buffer sets to use */
+
+enum vlMacroBlockTypeEx
+{
+	vlMacroBlockExTypeIntra,
+	vlMacroBlockExTypeFwdPredictedFrame,
+	vlMacroBlockExTypeFwdPredictedField,
+	vlMacroBlockExTypeBkwdPredictedFrame,
+	vlMacroBlockExTypeBkwdPredictedField,
+	vlMacroBlockExTypeBiPredictedFrame,
+	vlMacroBlockExTypeBiPredictedField,
+
+	vlNumMacroBlockExTypes
+};
+
+struct vlVertexShaderConsts
+{
+	struct vlVertex4f denorm;
+};
+
+struct vlFragmentShaderConsts
+{
+	struct vlVertex4f multiplier;
+	struct vlVertex4f div;
+};
+
+struct vlR16SnormBufferedMC
+{
+	struct vlRender				base;
+
+	unsigned int				picture_width, picture_height;
+	enum vlFormat				picture_format;
+
+	unsigned int				cur_buf;
+	struct vlSurface			*buffered_surface;
+	struct vlSurface			*past_surface, *future_surface;
+	struct vlVertex2f			surface_tex_inv_size;
+	struct vlVertex2f			zero_block[3];
+	unsigned int				num_macroblocks;
+	struct vlMpeg2MacroBlock		*macroblocks;
+
+	struct pipe_context			*pipe;
+	struct pipe_viewport_state		viewport;
+	struct pipe_framebuffer_state		render_target;
+	struct pipe_sampler_state		*samplers[5];
+	struct pipe_texture			*textures[NUM_BUF_SETS][5];
+	void					*i_vs, *p_vs[2], *b_vs[2];
+	void					*i_fs, *p_fs[2], *b_fs[2];
+	struct pipe_vertex_buffer 		vertex_bufs[NUM_BUF_SETS][3];
+	struct pipe_vertex_element		vertex_elems[8];
+	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
+};
+
+static int vlBegin
+(
+	struct vlRender *render
+)
+{
+	assert(render);
+
+	return 0;
+}
+
+static inline int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static inline int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch * 2,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static inline int vlGrabNoBlock(short *dst, unsigned int dst_pitch)
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memset
+		(
+			dst + y * dst_pitch,
+			0,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static inline int vlGrabBlocks
+(
+	struct vlR16SnormBufferedMC *mc,
+	unsigned int mbx,
+	unsigned int mby,
+	enum vlDCTType dct_type,
+	unsigned int coded_block_pattern,
+	short *blocks
+)
+{
+	struct pipe_surface	*tex_surface;
+	short			*texels;
+	unsigned int		tex_pitch;
+	unsigned int		x, y, tb = 0, sb = 0;
+	unsigned int		mbpx = mbx * VL_MACROBLOCK_WIDTH, mbpy = mby * VL_MACROBLOCK_HEIGHT;
+
+	assert(mc);
+	assert(blocks);
+
+	tex_surface = mc->pipe->screen->get_tex_surface
+	(
+		mc->pipe->screen,
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][0],
+		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+	tex_pitch = tex_surface->stride / tex_surface->block.size;
+
+	texels += mbpy * tex_pitch + mbpx;
+
+	for (y = 0; y < 2; ++y)
+	{
+		for (x = 0; x < 2; ++x, ++tb)
+		{
+			if ((coded_block_pattern >> (5 - tb)) & 1)
+			{
+				short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+				if (dct_type == vlDCTTypeFrameCoded)
+				{
+					vlGrabFrameCodedBlock
+					(
+						cur_block,
+						texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH,
+						tex_pitch
+					);
+				}
+				else
+				{
+					vlGrabFieldCodedBlock
+					(
+						cur_block,
+						texels + y * tex_pitch + x * VL_BLOCK_WIDTH,
+						tex_pitch
+					);
+				}
+
+				++sb;
+			}
+			else if (mc->zero_block[0].x < 0.0f)
+			{
+				vlGrabNoBlock(texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, tex_pitch);
+
+				mc->zero_block[0].x = (mbpx + x * 8) * mc->surface_tex_inv_size.x;
+				mc->zero_block[0].y = (mbpy + y * 8) * mc->surface_tex_inv_size.y;
+			}
+		}
+	}
+
+	pipe_surface_unmap(tex_surface);
+
+	/* TODO: Implement 422, 444 */
+	mbpx >>= 1;
+	mbpy >>= 1;
+
+	for (tb = 0; tb < 2; ++tb)
+	{
+		tex_surface = mc->pipe->screen->get_tex_surface
+		(
+			mc->pipe->screen,
+			mc->textures[mc->cur_buf % NUM_BUF_SETS][tb + 1],
+			0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+		);
+
+		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+		tex_pitch = tex_surface->stride / tex_surface->block.size;
+
+		texels += mbpy * tex_pitch + mbpx;
+
+		if ((coded_block_pattern >> (1 - tb)) & 1)
+		{
+			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+			vlGrabFrameCodedBlock
+			(
+				cur_block,
+				texels,
+				tex_pitch
+			);
+
+			++sb;
+		}
+		else if (mc->zero_block[tb + 1].x < 0.0f)
+		{
+			vlGrabNoBlock(texels, tex_pitch);
+
+			mc->zero_block[tb + 1].x = (mbpx << 1) * mc->surface_tex_inv_size.x;
+			mc->zero_block[tb + 1].y = (mbpy << 1) * mc->surface_tex_inv_size.y;
+		}
+
+		pipe_surface_unmap(tex_surface);
+	}
+
+	return 0;
+}
+
+static inline enum vlMacroBlockTypeEx vlGetMacroBlockTypeEx(struct vlMpeg2MacroBlock *mb)
+{
+	assert(mb);
+
+	switch (mb->mb_type)
+	{
+		case vlMacroBlockTypeIntra:
+			return vlMacroBlockExTypeIntra;
+		case vlMacroBlockTypeFwdPredicted:
+			return mb->mo_type == vlMotionTypeFrame ?
+				vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField;
+		case vlMacroBlockTypeBkwdPredicted:
+			return mb->mo_type == vlMotionTypeFrame ?
+				vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField;
+		case vlMacroBlockTypeBiPredicted:
+			return mb->mo_type == vlMotionTypeFrame ?
+				vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField;
+		default:
+			assert(0);
+	}
+
+	/* Unreachable */
+	return -1;
+}
+
+static inline int vlGrabMacroBlock
+(
+	struct vlR16SnormBufferedMC *mc,
+	struct vlMpeg2MacroBlock *macroblock
+)
+{
+	assert(mc);
+	assert(macroblock);
+
+	mc->macroblocks[mc->num_macroblocks].mbx = macroblock->mbx;
+	mc->macroblocks[mc->num_macroblocks].mby = macroblock->mby;
+	mc->macroblocks[mc->num_macroblocks].mb_type = macroblock->mb_type;
+	mc->macroblocks[mc->num_macroblocks].mo_type = macroblock->mo_type;
+	mc->macroblocks[mc->num_macroblocks].dct_type = macroblock->dct_type;
+	mc->macroblocks[mc->num_macroblocks].PMV[0][0][0] = macroblock->PMV[0][0][0];
+	mc->macroblocks[mc->num_macroblocks].PMV[0][0][1] = macroblock->PMV[0][0][1];
+	mc->macroblocks[mc->num_macroblocks].PMV[0][1][0] = macroblock->PMV[0][1][0];
+	mc->macroblocks[mc->num_macroblocks].PMV[0][1][1] = macroblock->PMV[0][1][1];
+	mc->macroblocks[mc->num_macroblocks].PMV[1][0][0] = macroblock->PMV[1][0][0];
+	mc->macroblocks[mc->num_macroblocks].PMV[1][0][1] = macroblock->PMV[1][0][1];
+	mc->macroblocks[mc->num_macroblocks].PMV[1][1][0] = macroblock->PMV[1][1][0];
+	mc->macroblocks[mc->num_macroblocks].PMV[1][1][1] = macroblock->PMV[1][1][1];
+	mc->macroblocks[mc->num_macroblocks].cbp = macroblock->cbp;
+	mc->macroblocks[mc->num_macroblocks].blocks = macroblock->blocks;
+
+	vlGrabBlocks
+	(
+		mc,
+		macroblock->mbx,
+		macroblock->mby,
+		macroblock->dct_type,
+		macroblock->cbp,
+		macroblock->blocks
+	);
+
+	mc->num_macroblocks++;
+
+	return 0;
+}
+
+#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, zb)					\
+	(vb)[0].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[0].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[1].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].pos.y = (mby) * (unity) + (ofsy);			\
+	(vb)[4].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+	(vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
+																\
+	if ((cbp) & (lm))													\
+	{															\
+		(vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].luma_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].luma_tc.x = (zb)[0].x;		(vb)[0].luma_tc.y = (zb)[0].y;						\
+		(vb)[1].luma_tc.x = (zb)[0].x;		(vb)[1].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[2].luma_tc.x = (zb)[0].x + (hx);	(vb)[2].luma_tc.y = (zb)[0].y;						\
+		(vb)[3].luma_tc.x = (zb)[0].x + (hx);	(vb)[3].luma_tc.y = (zb)[0].y;						\
+		(vb)[4].luma_tc.x = (zb)[0].x;		(vb)[4].luma_tc.y = (zb)[0].y + (hy);					\
+		(vb)[5].luma_tc.x = (zb)[0].x + (hx);	(vb)[5].luma_tc.y = (zb)[0].y + (hy);					\
+	}															\
+																\
+	if ((cbp) & (cbm))													\
+	{															\
+		(vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cb_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cb_tc.x = (zb)[1].x;		(vb)[0].cb_tc.y = (zb)[1].y;						\
+		(vb)[1].cb_tc.x = (zb)[1].x;		(vb)[1].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[2].cb_tc.x = (zb)[1].x + (hx);	(vb)[2].cb_tc.y = (zb)[1].y;						\
+		(vb)[3].cb_tc.x = (zb)[1].x + (hx);	(vb)[3].cb_tc.y = (zb)[1].y;						\
+		(vb)[4].cb_tc.x = (zb)[1].x;		(vb)[4].cb_tc.y = (zb)[1].y + (hy);					\
+		(vb)[5].cb_tc.x = (zb)[1].x + (hx);	(vb)[5].cb_tc.y = (zb)[1].y + (hy);					\
+	}															\
+																\
+	if ((cbp) & (crm))													\
+	{															\
+		(vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cr_tc.y = (mby) * (unity) + (ofsy);		\
+		(vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+		(vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
+	}															\
+	else															\
+	{															\
+		(vb)[0].cr_tc.x = (zb)[2].x;		(vb)[0].cr_tc.y = (zb)[2].y;						\
+		(vb)[1].cr_tc.x = (zb)[2].x;		(vb)[1].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[2].cr_tc.x = (zb)[2].x + (hx);	(vb)[2].cr_tc.y = (zb)[2].y;						\
+		(vb)[3].cr_tc.x = (zb)[2].x + (hx);	(vb)[3].cr_tc.y = (zb)[2].y;						\
+		(vb)[4].cr_tc.x = (zb)[2].x;		(vb)[4].cr_tc.y = (zb)[2].y + (hy);					\
+		(vb)[5].cr_tc.x = (zb)[2].x + (hx);	(vb)[5].cr_tc.y = (zb)[2].y + (hy);					\
+	}
+
+static inline int vlGrabMacroBlockVB
+(
+	struct vlR16SnormBufferedMC *mc,
+	struct vlMpeg2MacroBlock *macroblock,
+	unsigned int pos
+)
+{
+	struct vlVertex2f	mo_vec[2];
+	unsigned int		i;
+
+	assert(mc);
+	assert(macroblock);
+
+	switch (macroblock->mb_type)
+	{
+		case vlMacroBlockTypeBiPredicted:
+		{
+			struct vlVertex2f *vb;
+
+			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map
+			(
+				mc->pipe->winsys,
+				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][2].buffer,
+				PIPE_BUFFER_USAGE_CPU_WRITE
+			) + pos * 2 * 24;
+
+			mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
+			mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;
+
+			if (macroblock->mo_type == vlMotionTypeFrame)
+			{
+				for (i = 0; i < 24 * 2; i += 2)
+				{
+					vb[i].x = mo_vec[0].x;
+					vb[i].y = mo_vec[0].y;
+				}
+			}
+			else
+			{
+				mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
+				mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;
+
+				for (i = 0; i < 24 * 2; i += 2)
+				{
+					vb[i].x = mo_vec[0].x;
+					vb[i].y = mo_vec[0].y;
+					vb[i + 1].x = mo_vec[1].x;
+					vb[i + 1].y = mo_vec[1].y;
+				}
+			}
+
+			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][2].buffer);
+
+			/* fall-through */
+		}
+		case vlMacroBlockTypeFwdPredicted:
+		case vlMacroBlockTypeBkwdPredicted:
+		{
+			struct vlVertex2f *vb;
+
+			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map
+			(
+				mc->pipe->winsys,
+				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][1].buffer,
+				PIPE_BUFFER_USAGE_CPU_WRITE
+			) + pos * 2 * 24;
+
+			if (macroblock->mb_type == vlMacroBlockTypeBkwdPredicted)
+			{
+				mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
+				mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;
+
+				if (macroblock->mo_type == vlMotionTypeField)
+				{
+					mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
+					mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;
+				}
+			}
+			else
+			{
+				mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x;
+				mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y;
+
+				if (macroblock->mo_type == vlMotionTypeField)
+				{
+					mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x;
+					mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y;
+				}
+			}
+
+			if (macroblock->mo_type == vlMotionTypeFrame)
+			{
+				for (i = 0; i < 24 * 2; i += 2)
+				{
+					vb[i].x = mo_vec[0].x;
+					vb[i].y = mo_vec[0].y;
+				}
+			}
+			else
+			{
+				for (i = 0; i < 24 * 2; i += 2)
+				{
+					vb[i].x = mo_vec[0].x;
+					vb[i].y = mo_vec[0].y;
+					vb[i + 1].x = mo_vec[1].x;
+					vb[i + 1].y = mo_vec[1].y;
+				}
+			}
+
+			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][1].buffer);
+
+			/* fall-through */
+		}
+		case vlMacroBlockTypeIntra:
+		{
+			const struct vlVertex2f	unit =
+			{
+				mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH,
+				mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT
+			};
+			const struct vlVertex2f half =
+			{
+				mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2),
+				mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2)
+			};
+
+			struct vlMacroBlockVertexStream0
+			{
+				struct vlVertex2f pos;
+				struct vlVertex2f luma_tc;
+				struct vlVertex2f cb_tc;
+				struct vlVertex2f cr_tc;
+			} *vb;
+
+			vb = (struct vlMacroBlockVertexStream0*)mc->pipe->winsys->buffer_map
+			(
+				mc->pipe->winsys,
+				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][0].buffer,
+				PIPE_BUFFER_USAGE_CPU_WRITE
+			) + pos * 24;
+
+			SET_BLOCK
+			(
+				vb,
+				macroblock->cbp, macroblock->mbx, macroblock->mby,
+				unit.x, unit.y, 0, 0, half.x, half.y,
+				32, 2, 1, mc->zero_block
+			);
+
+			SET_BLOCK
+			(
+				vb + 6,
+				macroblock->cbp, macroblock->mbx, macroblock->mby,
+				unit.x, unit.y, half.x, 0, half.x, half.y,
+				16, 2, 1, mc->zero_block
+			);
+
+			SET_BLOCK
+			(
+				vb + 12,
+				macroblock->cbp, macroblock->mbx, macroblock->mby,
+				unit.x, unit.y, 0, half.y, half.x, half.y,
+				8, 2, 1, mc->zero_block
+			);
+
+			SET_BLOCK
+			(
+				vb + 18,
+				macroblock->cbp, macroblock->mbx, macroblock->mby,
+				unit.x, unit.y, half.x, half.y, half.x, half.y,
+				4, 2, 1, mc->zero_block
+			);
+
+			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][0].buffer);
+
+			break;
+		}
+		default:
+			assert(0);
+	}
+
+	return 0;
+}
+
+static int vlFlush
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormBufferedMC	*mc;
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+	unsigned int			num_macroblocks[vlNumMacroBlockExTypes] = {0};
+	unsigned int			offset[vlNumMacroBlockExTypes];
+	unsigned int			vb_start = 0;
+	unsigned int			mbw;
+	unsigned int			mbh;
+	unsigned int			num_mb_per_frame;
+	unsigned int			i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormBufferedMC*)render;
+
+	if (!mc->buffered_surface)
+		return 0;
+
+	mbw = align(mc->picture_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH;
+	mbh = align(mc->picture_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT;
+	num_mb_per_frame = mbw * mbh;
+
+	if (mc->num_macroblocks < num_mb_per_frame)
+		return 0;
+
+	pipe = mc->pipe;
+
+	for (i = 0; i < mc->num_macroblocks; ++i)
+	{
+		enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]);
+
+		num_macroblocks[mb_type_ex]++;
+	}
+
+	offset[0] = 0;
+
+	for (i = 1; i < vlNumMacroBlockExTypes; ++i)
+		offset[i] = offset[i - 1] + num_macroblocks[i - 1];
+
+	for (i = 0; i < mc->num_macroblocks; ++i)
+	{
+		enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]);
+
+		vlGrabMacroBlockVB(mc, &mc->macroblocks[i], offset[mb_type_ex]);
+
+		offset[mb_type_ex]++;
+	}
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
+	(
+		pipe->screen,
+		mc->buffered_surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+	pipe->set_viewport_state(pipe, &mc->viewport);
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->denorm.x = mc->buffered_surface->texture->width[0];
+	vs_consts->denorm.y = mc->buffered_surface->texture->height[0];
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf);
+
+	if (num_macroblocks[vlMacroBlockExTypeIntra] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 1, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 4, mc->vertex_elems);
+		pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->i_vs);
+		pipe->bind_fs_state(pipe, mc->i_fs);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeIntra] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeIntra] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
+		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->p_vs[0]);
+		pipe->bind_fs_state(pipe, mc->p_fs[0]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeFwdPredictedField] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
+		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->p_vs[1]);
+		pipe->bind_fs_state(pipe, mc->p_fs[1]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture;
+		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->p_vs[0]);
+		pipe->bind_fs_state(pipe, mc->p_fs[0]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture;
+		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->p_vs[1]);
+		pipe->bind_fs_state(pipe, mc->p_fs[1]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 8, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture;
+		pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->b_vs[0]);
+		pipe->bind_fs_state(pipe, mc->b_fs[0]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24;
+	}
+
+	if (num_macroblocks[vlMacroBlockExTypeBiPredictedField] > 0)
+	{
+		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->set_vertex_elements(pipe, 8, mc->vertex_elems);
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture;
+		mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture;
+		pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]);
+		pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
+		pipe->bind_vs_state(pipe, mc->b_vs[1]);
+		pipe->bind_fs_state(pipe, mc->b_fs[1]);
+
+		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24);
+		vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24;
+	}
+
+	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &mc->buffered_surface->render_fence);
+
+	for (i = 0; i < 3; ++i)
+		mc->zero_block[i].x = -1.0f;
+
+	mc->buffered_surface = NULL;
+	mc->num_macroblocks = 0;
+	mc->cur_buf++;
+
+	return 0;
+}
+
+static int vlRenderMacroBlocksMpeg2R16SnormBuffered
+(
+	struct vlRender *render,
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
+)
+{
+	struct vlR16SnormBufferedMC	*mc;
+	unsigned int			i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormBufferedMC*)render;
+
+	if (mc->buffered_surface)
+	{
+		if (mc->buffered_surface != surface)
+		{
+			vlFlush(&mc->base);
+			mc->buffered_surface = surface;
+			mc->past_surface = batch->past_surface;
+			mc->future_surface = batch->future_surface;
+			mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0];
+			mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0];
+		}
+	}
+	else
+	{
+		mc->buffered_surface = surface;
+		mc->past_surface = batch->past_surface;
+		mc->future_surface = batch->future_surface;
+		mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0];
+		mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0];
+	}
+
+	for (i = 0; i < batch->num_macroblocks; ++i)
+		vlGrabMacroBlock(mc, &batch->macroblocks[i]);
+
+	return 0;
+}
+
+static int vlEnd
+(
+	struct vlRender *render
+)
+{
+	assert(render);
+
+	return 0;
+}
+
+static int vlDestroy
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormBufferedMC	*mc;
+	struct pipe_context		*pipe;
+	unsigned int			h, i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormBufferedMC*)render;
+	pipe = mc->pipe;
+
+	for (i = 0; i < 5; ++i)
+		pipe->delete_sampler_state(pipe, mc->samplers[i]);
+
+	for (h = 0; h < NUM_BUF_SETS; ++h)
+			for (i = 0; i < 3; ++i)
+				pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[h][i].buffer);
+
+	/* Textures 3 & 4 are not created directly, no need to release them here */
+	for (i = 0; i < NUM_BUF_SETS; ++i)
+	{
+		pipe_texture_release(&mc->textures[i][0]);
+		pipe_texture_release(&mc->textures[i][1]);
+		pipe_texture_release(&mc->textures[i][2]);
+	}
+
+	pipe->delete_vs_state(pipe, mc->i_vs);
+	pipe->delete_fs_state(pipe, mc->i_fs);
+
+	for (i = 0; i < 2; ++i)
+	{
+		pipe->delete_vs_state(pipe, mc->p_vs[i]);
+		pipe->delete_fs_state(pipe, mc->p_fs[i]);
+		pipe->delete_vs_state(pipe, mc->b_vs[i]);
+		pipe->delete_fs_state(pipe, mc->b_fs[i]);
+	}
+
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer);
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer);
+
+	free(mc->macroblocks);
+	free(mc);
+
+	return 0;
+}
+
+/*
+ * Muliplier renormalizes block samples from 16 bits to 12 bits.
+ * Divider is used when calculating Y % 2 for choosing top or bottom
+ * field for P or B macroblocks.
+ * TODO: Use immediates.
+ */
+static const struct vlFragmentShaderConsts fs_consts =
+{
+	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
+	{0.5f, 2.0f, 0.0f, 0.0f}
+};
+
+static int vlCreateVertexShaderIMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 50;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma Cb texcoords
+	 * decl i3		; Chroma Cr texcoords
+	 */
+	for (i = 0; i < 4; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma Cb texcoords
+	 * decl o3		; Chroma Cr texcoords
+	 */
+	for (i = 0; i < 4; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * mov o0, i0		; Move input vertex pos to output
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma Cb texcoords to output
+	 * mov o3, i3		; Move input chroma Cr texcoords to output
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->i_vs = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderIMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Luma texcoords
+	 * decl i1			; Chroma Cb texcoords
+	 * decl i2			; Chroma Cr texcoords
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->i_fs = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFramePMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma Cb texcoords
+	 * decl i3		; Chroma Cr texcoords
+	 * decl i4		; Ref surface top field texcoords
+	 * decl i5		; Ref surface bottom field texcoords (unused, packed in the same stream)
+	 */
+	for (i = 0; i < 6; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma Cb texcoords
+	 * decl o3		; Chroma Cr texcoords
+	 * decl o4		; Ref macroblock texcoords
+	 */
+	for (i = 0; i < 5; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * mov o0, i0		; Move input vertex pos to output
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma Cb texcoords to output
+	 * mov o3, i3		; Move input chroma Cr texcoords to output
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o4, i0, i4	; Translate vertex pos by motion vec to form ref macroblock texcoords */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFieldPMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma Cb texcoords
+	 * decl i3		; Chroma Cr texcoords
+	 * decl i4              ; Ref macroblock top field texcoords
+	 * decl i5              ; Ref macroblock bottom field texcoords
+	 */
+	for (i = 0; i < 6; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0		; Render target dimensions */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma Cb texcoords
+	 * decl o3		; Chroma Cr texcoords
+	 * decl o4		; Ref macroblock top field texcoords
+	 * decl o5		; Ref macroblock bottom field texcoords
+	 * decl o6		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 7; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * mov o0, i0		; Move input vertex pos to output
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma Cb texcoords to output
+	 * mov o3, i3		; Move input chroma Cr texcoords to output
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o4, i0, i4	; Translate vertex pos by motion vec to form top field macroblock texcoords
+	 * add o5, i0, i5	; Translate vertex pos by motion vec to form bottom field macroblock texcoords
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o6, i0, c0	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 6, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[1] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFramePMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Luma texcoords
+	 * decl i1			; Chroma Cb texcoords
+	 * decl i2			; Chroma Cr texcoords
+	 * decl i3			; Ref macroblock texcoords
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* tex2d t1, i3, s3		; Read texel from ref macroblock */
+	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[0] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFieldPMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Luma texcoords
+	 * decl i1			; Chroma Cb texcoords
+	 * decl i2			; Chroma Cr texcoords
+	 * decl i3			; Ref macroblock top field texcoords
+	 * decl i4			; Ref macroblock bottom field texcoords
+	 * decl i5			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 6; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t4 */
+	decl = vl_decl_temps(0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i3, s3		; Read texel from ref macroblock top field
+	 * tex2d t2, i4, s3		; Read texel from ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i5.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 5, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFrameBMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma Cb texcoords
+	 * decl i3		; Chroma Cr texcoords
+	 * decl i4              ; First ref macroblock top field texcoords
+	 * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
+	 * decl i6		; Second ref macroblock top field texcoords
+	 * decl i7		; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
+	 */
+	for (i = 0; i < 8; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma Cb texcoords
+	 * decl o3		; Chroma Cr texcoords
+	 * decl o4		; First ref macroblock texcoords
+	 * decl o5		; Second ref macroblock texcoords
+	 */
+	for (i = 0; i < 6; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * mov o0, i0		; Move input vertex pos to output
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma Cb texcoords to output
+	 * mov o3, i3		; Move input chroma Cr texcoords to output
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o4, i0, i4	; Translate vertex pos by motion vec to form first ref macroblock texcoords
+	 * add o5, i0, i6	; Translate vertex pos by motion vec to form second ref macroblock texcoords
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[0] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFieldBMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma Cb texcoords
+	 * decl i3		; Chroma Cr texcoords
+	 * decl i4              ; First ref macroblock top field texcoords
+	 * decl i5              ; First ref macroblock bottom field texcoords
+	 * decl i6              ; Second ref macroblock top field texcoords
+	 * decl i7              ; Second ref macroblock bottom field texcoords
+	 */
+	for (i = 0; i < 8; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0		; Render target dimensions */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma Cb texcoords
+	 * decl o3		; Chroma Cr texcoords
+	 * decl o4		; First ref macroblock top field texcoords
+	 * decl o5		; First ref macroblock Bottom field texcoords
+	 * decl o6		; Second ref macroblock top field texcoords
+	 * decl o7		; Second ref macroblock Bottom field texcoords
+	 * decl o8		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 9; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o0, i0		; Move input vertex pos to output
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma Cb texcoords to output
+	 * mov o3, i3		; Move input chroma Cr texcoords to output
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o4, i0, i4	; Translate vertex pos by motion vec to form first top field macroblock texcoords
+	 * add o5, i0, i5	; Translate vertex pos by motion vec to form first bottom field macroblock texcoords
+	 * add o6, i0, i6	; Translate vertex pos by motion vec to form second top field macroblock texcoords
+	 * add o7, i0, i7	; Translate vertex pos by motion vec to form second bottom field macroblock texcoords
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o8, i0, c0	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 8, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[1] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFrameBMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Luma texcoords
+	 * decl i1			; Chroma Cb texcoords
+	 * decl i2			; Chroma Cr texcoords
+	 * decl i3			; First ref macroblock texcoords
+	 * decl i4			; Second ref macroblock texcoords
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t2 */
+	decl = vl_decl_temps(0, 2);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for first ref surface texture
+	 * decl s4			; Sampler for second ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i3, s3		; Read texel from first ref macroblock
+	 * tex2d t2, i4, s4		; Read texel from second ref macroblock
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[0] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFieldBMB
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Luma texcoords
+	 * decl i1			; Chroma Cb texcoords
+	 * decl i2			; Chroma Cr texcoords
+	 * decl i3			; First ref macroblock top field texcoords
+	 * decl i4			; First ref macroblock bottom field texcoords
+	 * decl i5			; Second ref macroblock top field texcoords
+	 * decl i6			; Second ref macroblock bottom field texcoords
+	 * decl i7			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 8; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
+	 *				; and for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t5 */
+	decl = vl_decl_temps(0, 5);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for first ref surface texture
+	 * decl s4			; Sampler for second ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i7.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 7, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i3, s3		; Read texel from past ref macroblock top field
+	 * tex2d t2, i4, s3		; Read texel from past ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t4, i5, s4		; Read texel from future ref macroblock top field
+	 * tex2d t5, i6, s4		; Read texel from future ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 5, TGSI_FILE_SAMPLER, 4);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateDataBufs
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	const unsigned int	mbw = align(mc->picture_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH;
+	const unsigned int	mbh = align(mc->picture_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT;
+	const unsigned int	num_mb_per_frame = mbw * mbh;
+
+	struct pipe_context	*pipe;
+	unsigned int		h, i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	/* Create our vertex buffers */
+	for (h = 0; h < NUM_BUF_SETS; ++h)
+	{
+		mc->vertex_bufs[h][0].pitch = sizeof(struct vlVertex2f) * 4;
+		mc->vertex_bufs[h][0].max_index = 24 * num_mb_per_frame - 1;
+		mc->vertex_bufs[h][0].buffer_offset = 0;
+		mc->vertex_bufs[h][0].buffer = pipe->winsys->buffer_create
+		(
+			pipe->winsys,
+			1,
+			PIPE_BUFFER_USAGE_VERTEX,
+			sizeof(struct vlVertex2f) * 4 * 24 * num_mb_per_frame
+		);
+
+		for (i = 1; i < 3; ++i)
+		{
+			mc->vertex_bufs[h][i].pitch = sizeof(struct vlVertex2f) * 2;
+			mc->vertex_bufs[h][i].max_index = 24 * num_mb_per_frame - 1;
+			mc->vertex_bufs[h][i].buffer_offset = 0;
+			mc->vertex_bufs[h][i].buffer = pipe->winsys->buffer_create
+			(
+				pipe->winsys,
+				1,
+				PIPE_BUFFER_USAGE_VERTEX,
+				sizeof(struct vlVertex2f) * 2 * 24 * num_mb_per_frame
+			);
+		}
+	}
+
+	/* Position element */
+	mc->vertex_elems[0].src_offset = 0;
+	mc->vertex_elems[0].vertex_buffer_index = 0;
+	mc->vertex_elems[0].nr_components = 2;
+	mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Luma, texcoord element */
+	mc->vertex_elems[1].src_offset = sizeof(struct vlVertex2f);
+	mc->vertex_elems[1].vertex_buffer_index = 0;
+	mc->vertex_elems[1].nr_components = 2;
+	mc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Chroma Cr texcoord element */
+	mc->vertex_elems[2].src_offset = sizeof(struct vlVertex2f) * 2;
+	mc->vertex_elems[2].vertex_buffer_index = 0;
+	mc->vertex_elems[2].nr_components = 2;
+	mc->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Chroma Cb texcoord element */
+	mc->vertex_elems[3].src_offset = sizeof(struct vlVertex2f) * 3;
+	mc->vertex_elems[3].vertex_buffer_index = 0;
+	mc->vertex_elems[3].nr_components = 2;
+	mc->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* First ref surface top field texcoord element */
+	mc->vertex_elems[4].src_offset = 0;
+	mc->vertex_elems[4].vertex_buffer_index = 1;
+	mc->vertex_elems[4].nr_components = 2;
+	mc->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* First ref surface bottom field texcoord element */
+	mc->vertex_elems[5].src_offset = sizeof(struct vlVertex2f);
+	mc->vertex_elems[5].vertex_buffer_index = 1;
+	mc->vertex_elems[5].nr_components = 2;
+	mc->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Second ref surface top field texcoord element */
+	mc->vertex_elems[6].src_offset = 0;
+	mc->vertex_elems[6].vertex_buffer_index = 2;
+	mc->vertex_elems[6].nr_components = 2;
+	mc->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Second ref surface bottom field texcoord element */
+	mc->vertex_elems[7].src_offset = sizeof(struct vlVertex2f);
+	mc->vertex_elems[7].vertex_buffer_index = 2;
+	mc->vertex_elems[7].nr_components = 2;
+	mc->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/* Create our constant buffer */
+	mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
+	mc->vs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->vs_const_buf.size
+	);
+
+	mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
+	mc->fs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->fs_const_buf.size
+	);
+
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		&fs_consts,
+		sizeof(struct vlFragmentShaderConsts)
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer);
+
+	mc->macroblocks = malloc(sizeof(struct vlMpeg2MacroBlock) * num_mb_per_frame);
+
+	return 0;
+}
+
+static int vlInit
+(
+	struct vlR16SnormBufferedMC *mc
+)
+{
+	struct pipe_context		*pipe;
+	struct pipe_sampler_state	sampler;
+	struct pipe_texture		template;
+	unsigned int			filters[5];
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	/* For MC we render to textures, which are rounded up to nearest POT */
+	mc->viewport.scale[0] = vlRoundUpPOT(mc->picture_width);
+	mc->viewport.scale[1] = vlRoundUpPOT(mc->picture_height);
+	mc->viewport.scale[2] = 1;
+	mc->viewport.scale[3] = 1;
+	mc->viewport.translate[0] = 0;
+	mc->viewport.translate[1] = 0;
+	mc->viewport.translate[2] = 0;
+	mc->viewport.translate[3] = 0;
+
+	mc->render_target.width = vlRoundUpPOT(mc->picture_width);
+	mc->render_target.height = vlRoundUpPOT(mc->picture_height);
+	mc->render_target.num_cbufs = 1;
+	/* FB for MC stage is a vlSurface created by the user, set at render time */
+	mc->render_target.zsbuf = NULL;
+
+	filters[0] = PIPE_TEX_FILTER_NEAREST;
+	/* FIXME: Linear causes discoloration around block edges */
+	filters[1] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
+	filters[2] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
+	filters[3] = PIPE_TEX_FILTER_LINEAR;
+	filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+	for (i = 0; i < 5; ++i)
+	{
+		sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+		sampler.min_img_filter = filters[i];
+		sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+		sampler.mag_img_filter = filters[i];
+		sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+		sampler.compare_func = PIPE_FUNC_ALWAYS;
+		sampler.normalized_coords = 1;
+		/*sampler.prefilter = ;*/
+		/*sampler.shadow_ambient = ;*/
+		/*sampler.lod_bias = ;*/
+		sampler.min_lod = 0;
+		/*sampler.max_lod = ;*/
+		/*sampler.border_color[i] = ;*/
+		/*sampler.max_anisotropy = ;*/
+		mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler);
+	}
+
+	memset(&template, 0, sizeof(struct pipe_texture));
+	template.target = PIPE_TEXTURE_2D;
+	template.format = PIPE_FORMAT_R16_SNORM;
+	template.last_level = 0;
+	template.width[0] = vlRoundUpPOT(mc->picture_width);
+	template.height[0] = vlRoundUpPOT(mc->picture_height);
+	template.depth[0] = 1;
+	template.compressed = 0;
+	pf_get_block(template.format, &template.block);
+
+	for (i = 0; i < NUM_BUF_SETS; ++i)
+		mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template);
+
+	if (mc->picture_format == vlFormatYCbCr420)
+	{
+		template.width[0] = vlRoundUpPOT(mc->picture_width / 2);
+		template.height[0] = vlRoundUpPOT(mc->picture_height / 2);
+	}
+	else if (mc->picture_format == vlFormatYCbCr422)
+		template.height[0] = vlRoundUpPOT(mc->picture_height / 2);
+
+	for (i = 0; i < NUM_BUF_SETS; ++i)
+	{
+		mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template);
+		mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template);
+	}
+
+	/* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */
+
+	vlCreateVertexShaderIMB(mc);
+	vlCreateFragmentShaderIMB(mc);
+	vlCreateVertexShaderFramePMB(mc);
+	vlCreateVertexShaderFieldPMB(mc);
+	vlCreateFragmentShaderFramePMB(mc);
+	vlCreateFragmentShaderFieldPMB(mc);
+	vlCreateVertexShaderFrameBMB(mc);
+	vlCreateVertexShaderFieldBMB(mc);
+	vlCreateFragmentShaderFrameBMB(mc);
+	vlCreateFragmentShaderFieldBMB(mc);
+	vlCreateDataBufs(mc);
+
+	return 0;
+}
+
+int vlCreateR16SNormBufferedMC
+(
+	struct pipe_context *pipe,
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	struct vlRender **render
+)
+{
+	struct vlR16SnormBufferedMC	*mc;
+	unsigned int			i;
+
+	assert(pipe);
+	assert(render);
+
+	mc = calloc(1, sizeof(struct vlR16SnormBufferedMC));
+
+	mc->base.vlBegin = &vlBegin;
+	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16SnormBuffered;
+	mc->base.vlEnd = &vlEnd;
+	mc->base.vlFlush = &vlFlush;
+	mc->base.vlDestroy = &vlDestroy;
+	mc->pipe = pipe;
+	mc->picture_width = picture_width;
+	mc->picture_height = picture_height;
+
+	mc->cur_buf = 0;
+	mc->buffered_surface = NULL;
+	mc->past_surface = NULL;
+	mc->future_surface = NULL;
+	for (i = 0; i < 3; ++i)
+		mc->zero_block[i].x = -1.0f;
+	mc->num_macroblocks = 0;
+
+	vlInit(mc);
+
+	*render = &mc->base;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h
new file mode 100644
index 00000000000..27177d64cad
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h
@@ -0,0 +1,18 @@
+#ifndef vl_r16snorm_mc_buf_h
+#define vl_r16snorm_mc_buf_h
+
+#include "vl_types.h"
+
+struct pipe_context;
+struct vlRender;
+
+int vlCreateR16SNormBufferedMC
+(
+	struct pipe_context *pipe,
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	struct vlRender **render
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_render.h b/src/gallium/state_trackers/g3dvl/vl_render.h
new file mode 100644
index 00000000000..166030b4988
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_render.h
@@ -0,0 +1,38 @@
+#ifndef vl_render_h
+#define vl_render_h
+
+#include "vl_types.h"
+
+struct pipe_surface;
+
+struct vlRender
+{
+	int (*vlBegin)
+	(
+		struct vlRender *render
+	);
+
+	int (*vlRenderMacroBlocksMpeg2)
+	(
+		struct vlRender *render,
+		struct vlMpeg2MacroBlockBatch *batch,
+		struct vlSurface *surface
+	);
+
+	int (*vlEnd)
+	(
+		struct vlRender *render
+	);
+
+	int (*vlFlush)
+	(
+		struct vlRender *render
+	);
+
+	int (*vlDestroy)
+	(
+		struct vlRender *render
+	);
+};
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.c b/src/gallium/state_trackers/g3dvl/vl_screen.c
new file mode 100644
index 00000000000..484f63b0d4d
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_screen.c
@@ -0,0 +1,115 @@
+#define VL_INTERNAL
+#include "vl_screen.h"
+#include <assert.h>
+#include <stdlib.h>
+
+int vlCreateScreen
+(
+	struct vlDisplay *display,
+	int screen,
+	struct pipe_screen *pscreen,
+	struct vlScreen **vl_screen
+)
+{
+	struct vlScreen *scrn;
+
+	assert(display);
+	assert(pscreen);
+	assert(vl_screen);
+
+	scrn = calloc(1, sizeof(struct vlScreen));
+
+	if (!scrn)
+		return 1;
+
+	scrn->display = display;
+	scrn->ordinal = screen;
+	scrn->pscreen = pscreen;
+	*vl_screen = scrn;
+
+	return 0;
+}
+
+int vlDestroyScreen
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+
+	free(screen);
+
+	return 0;
+}
+
+struct vlDisplay* vlGetDisplay
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+
+	return screen->display;
+}
+
+struct pipe_screen* vlGetPipeScreen
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+
+	return screen->pscreen;
+}
+
+unsigned int vlGetMaxProfiles
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+
+	return vlProfileCount;
+}
+
+int vlQueryProfiles
+(
+	struct vlScreen *screen,
+	enum vlProfile *profiles
+)
+{
+	assert(screen);
+	assert(profiles);
+
+	profiles[0] = vlProfileMpeg2Simple;
+	profiles[1] = vlProfileMpeg2Main;
+
+	return 0;
+}
+
+unsigned int vlGetMaxEntryPoints
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+
+	return vlEntryPointCount;
+}
+
+int vlQueryEntryPoints
+(
+	struct vlScreen *screen,
+	enum vlProfile profile,
+	enum vlEntryPoint *entry_points
+)
+{
+	assert(screen);
+	assert(entry_points);
+
+	entry_points[0] = vlEntryPointIDCT;
+	entry_points[1] = vlEntryPointMC;
+	entry_points[2] = vlEntryPointCSC;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.h b/src/gallium/state_trackers/g3dvl/vl_screen.h
new file mode 100644
index 00000000000..98f3d429b61
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_screen.h
@@ -0,0 +1,63 @@
+#ifndef vl_screen_h
+#define vl_screen_h
+
+#include "vl_types.h"
+
+struct pipe_screen;
+
+#ifdef VL_INTERNAL
+struct vlScreen
+{
+	struct vlDisplay	*display;
+	unsigned int		ordinal;
+	struct pipe_screen	*pscreen;
+};
+#endif
+
+int vlCreateScreen
+(
+	struct vlDisplay *display,
+	int screen,
+	struct pipe_screen *pscreen,
+	struct vlScreen **vl_screen
+);
+
+int vlDestroyScreen
+(
+	struct vlScreen *screen
+);
+
+struct vlDisplay* vlGetDisplay
+(
+	struct vlScreen *screen
+);
+
+struct pipe_screen* vlGetPipeScreen
+(
+	struct vlScreen *screen
+);
+
+unsigned int vlGetMaxProfiles
+(
+	struct vlScreen *screen
+);
+
+int vlQueryProfiles
+(
+	struct vlScreen *screen,
+	enum vlProfile *profiles
+);
+
+unsigned int vlGetMaxEntryPoints
+(
+	struct vlScreen *screen
+);
+
+int vlQueryEntryPoints
+(
+	struct vlScreen *screen,
+	enum vlProfile profile,
+	enum vlEntryPoint *entry_points
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.c b/src/gallium/state_trackers/g3dvl/vl_shader_build.c
new file mode 100644
index 00000000000..51f1721a332
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.c
@@ -0,0 +1,215 @@
+#include "vl_shader_build.h"
+#include <assert.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	decl.Declaration.File = TGSI_FILE_INPUT;
+	decl.Declaration.Semantic = 1;
+	decl.Semantic.SemanticName = name;
+	decl.Semantic.SemanticIndex = index;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+	unsigned int name,
+	unsigned int index,
+	unsigned int first,
+	unsigned int last,
+	int interpolation
+)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	assert
+	(
+		interpolation == TGSI_INTERPOLATE_CONSTANT ||
+		interpolation == TGSI_INTERPOLATE_LINEAR ||
+		interpolation == TGSI_INTERPOLATE_PERSPECTIVE
+	);
+
+	decl.Declaration.File = TGSI_FILE_INPUT;
+	decl.Declaration.Semantic = 1;
+	decl.Semantic.SemanticName = name;
+	decl.Semantic.SemanticIndex = index;
+	decl.Declaration.Interpolate = interpolation;;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	decl.Declaration.File = TGSI_FILE_CONSTANT;
+	decl.Declaration.Semantic = 1;
+	decl.Semantic.SemanticName = name;
+	decl.Semantic.SemanticIndex = index;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	decl.Declaration.File = TGSI_FILE_OUTPUT;
+	decl.Declaration.Semantic = 1;
+	decl.Semantic.SemanticName = name;
+	decl.Semantic.SemanticIndex = index;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	decl = tgsi_default_full_declaration();
+	decl.Declaration.File = TGSI_FILE_TEMPORARY;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
+{
+	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
+
+	decl = tgsi_default_full_declaration();
+	decl.Declaration.File = TGSI_FILE_SAMPLER;
+	decl.DeclarationRange.First = first;
+	decl.DeclarationRange.Last = last;
+
+	return decl;
+}
+
+struct tgsi_full_instruction vl_inst2
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src_file,
+	unsigned int src_index
+)
+{
+	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+	inst.Instruction.Opcode = opcode;
+	inst.Instruction.NumDstRegs = 1;
+	inst.FullDstRegisters[0].DstRegister.File = dst_file;
+	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+	inst.Instruction.NumSrcRegs = 1;
+	inst.FullSrcRegisters[0].SrcRegister.File = src_file;
+	inst.FullSrcRegisters[0].SrcRegister.Index = src_index;
+
+	return inst;
+}
+
+struct tgsi_full_instruction vl_inst3
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index
+)
+{
+	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+	inst.Instruction.Opcode = opcode;
+	inst.Instruction.NumDstRegs = 1;
+	inst.FullDstRegisters[0].DstRegister.File = dst_file;
+	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+	inst.Instruction.NumSrcRegs = 2;
+	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+
+	return inst;
+}
+
+struct tgsi_full_instruction vl_tex
+(
+	int tex,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index
+)
+{
+	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+	inst.Instruction.Opcode = TGSI_OPCODE_TEX;
+	inst.Instruction.NumDstRegs = 1;
+	inst.FullDstRegisters[0].DstRegister.File = dst_file;
+	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+	inst.Instruction.NumSrcRegs = 2;
+	inst.InstructionExtTexture.Texture = tex;
+	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+
+	return inst;
+}
+
+struct tgsi_full_instruction vl_inst4
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index,
+	enum tgsi_file_type src3_file,
+	unsigned int src3_index
+)
+{
+	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+	inst.Instruction.Opcode = opcode;
+	inst.Instruction.NumDstRegs = 1;
+	inst.FullDstRegisters[0].DstRegister.File = dst_file;
+	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
+	inst.Instruction.NumSrcRegs = 3;
+	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
+	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
+	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+	inst.FullSrcRegisters[2].SrcRegister.File = src3_file;
+	inst.FullSrcRegisters[2].SrcRegister.Index = src3_index;
+
+	return inst;
+}
+
+struct tgsi_full_instruction vl_end(void)
+{
+	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+	inst.Instruction.Opcode = TGSI_OPCODE_END;
+	inst.Instruction.NumDstRegs = 0;
+	inst.Instruction.NumSrcRegs = 0;
+
+	return inst;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.h b/src/gallium/state_trackers/g3dvl/vl_shader_build.h
new file mode 100644
index 00000000000..dc615cb1566
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.h
@@ -0,0 +1,61 @@
+#ifndef vl_shader_build_h
+#define vl_shader_build_h
+
+#include <pipe/p_shader_tokens.h>
+
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+	unsigned int name,
+	unsigned int index,
+	unsigned int first,
+	unsigned int last,
+	int interpolation
+);
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last);
+struct tgsi_full_instruction vl_inst2
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src_file,
+	unsigned int src_index
+);
+struct tgsi_full_instruction vl_inst3
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index
+);
+struct tgsi_full_instruction vl_tex
+(
+	int tex,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index
+);
+struct tgsi_full_instruction vl_inst4
+(
+	int opcode,
+	enum tgsi_file_type dst_file,
+	unsigned int dst_index,
+	enum tgsi_file_type src1_file,
+	unsigned int src1_index,
+	enum tgsi_file_type src2_file,
+	unsigned int src2_index,
+	enum tgsi_file_type src3_file,
+	unsigned int src3_index
+);
+struct tgsi_full_instruction vl_end(void);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.c b/src/gallium/state_trackers/g3dvl/vl_surface.c
new file mode 100644
index 00000000000..076bd40d414
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_surface.c
@@ -0,0 +1,237 @@
+#define VL_INTERNAL
+#include "vl_surface.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pipe/p_screen.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <vl_winsys.h>
+#include "vl_screen.h"
+#include "vl_context.h"
+#include "vl_render.h"
+#include "vl_csc.h"
+#include "vl_util.h"
+
+int vlCreateSurface
+(
+	struct vlScreen *screen,
+	unsigned int width,
+	unsigned int height,
+	enum vlFormat format,
+	struct vlSurface **surface
+)
+{
+	struct vlSurface	*sfc;
+	struct pipe_texture	template;
+
+	assert(screen);
+	assert(surface);
+
+	sfc = calloc(1, sizeof(struct vlSurface));
+
+	if (!sfc)
+		return 1;
+
+	sfc->screen = screen;
+	sfc->width = width;
+	sfc->height = height;
+	sfc->format = format;
+
+	memset(&template, 0, sizeof(struct pipe_texture));
+	template.target = PIPE_TEXTURE_2D;
+	template.format = PIPE_FORMAT_A8R8G8B8_UNORM;
+	template.last_level = 0;
+	template.width[0] = vlRoundUpPOT(sfc->width);
+	template.height[0] = vlRoundUpPOT(sfc->height);
+	template.depth[0] = 1;
+	template.compressed = 0;
+	pf_get_block(template.format, &template.block);
+	template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+	sfc->texture = vlGetPipeScreen(screen)->texture_create(vlGetPipeScreen(screen), &template);
+
+	*surface = sfc;
+
+	return 0;
+}
+
+int vlDestroySurface
+(
+	struct vlSurface *surface
+)
+{
+	assert(surface);
+
+	pipe_texture_release(&surface->texture);
+	free(surface);
+
+	return 0;
+}
+
+int vlRenderMacroBlocksMpeg2
+(
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
+)
+{
+	assert(batch);
+	assert(surface);
+	assert(surface->context);
+
+	surface->context->render->vlBegin(surface->context->render);
+
+	surface->context->render->vlRenderMacroBlocksMpeg2
+	(
+		surface->context->render,
+		batch,
+		surface
+	);
+
+	surface->context->render->vlEnd(surface->context->render);
+
+	return 0;
+}
+
+int vlPutPicture
+(
+	struct vlSurface *surface,
+	vlNativeDrawable drawable,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,
+	int desty,
+	int destw,
+	int desth,
+	int drawable_w,
+	int drawable_h,
+	enum vlPictureType picture_type
+)
+{
+	struct vlCSC		*csc;
+	struct pipe_context	*pipe;
+
+	assert(surface);
+	assert(surface->context);
+
+	surface->context->render->vlFlush(surface->context->render);
+
+	csc = surface->context->csc;
+	pipe = surface->context->pipe;
+
+	csc->vlResizeFrameBuffer(csc, drawable_w, drawable_h);
+
+	csc->vlBegin(csc);
+
+	csc->vlPutPicture
+	(
+		csc,
+		surface,
+		srcx,
+		srcy,
+		srcw,
+		srch,
+		destx,
+		desty,
+		destw,
+		desth,
+		picture_type
+	);
+
+	csc->vlEnd(csc);
+
+	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &surface->disp_fence);
+
+	bind_pipe_drawable(pipe, drawable);
+
+	pipe->winsys->flush_frontbuffer
+	(
+		pipe->winsys,
+		csc->vlGetFrameBuffer(csc),
+		pipe->priv
+	);
+
+	return 0;
+}
+
+int vlSurfaceGetStatus
+(
+	struct vlSurface *surface,
+	enum vlResourceStatus *status
+)
+{
+	assert(surface);
+	assert(surface->context);
+	assert(status);
+
+	if (surface->render_fence && !surface->context->pipe->winsys->fence_signalled(surface->context->pipe->winsys, surface->render_fence, 0))
+	{
+		*status = vlResourceStatusRendering;
+		return 0;
+	}
+
+	if (surface->disp_fence && !surface->context->pipe->winsys->fence_signalled(surface->context->pipe->winsys, surface->disp_fence, 0))
+	{
+		*status = vlResourceStatusDisplaying;
+		return 0;
+	}
+
+	*status = vlResourceStatusFree;
+
+	return 0;
+}
+
+int vlSurfaceFlush
+(
+	struct vlSurface *surface
+)
+{
+	assert(surface);
+	assert(surface->context);
+
+	surface->context->render->vlFlush(surface->context->render);
+
+	return 0;
+}
+
+int vlSurfaceSync
+(
+	struct vlSurface *surface
+)
+{
+	assert(surface);
+	assert(surface->context);
+	assert(surface->render_fence);
+
+	surface->context->pipe->winsys->fence_finish(surface->context->pipe->winsys, surface->render_fence, 0);
+
+	return 0;
+}
+
+struct vlScreen* vlSurfaceGetScreen
+(
+	struct vlSurface *surface
+)
+{
+	assert(surface);
+
+	return surface->screen;
+}
+
+struct vlContext* vlBindToContext
+(
+	struct vlSurface *surface,
+	struct vlContext *context
+)
+{
+	struct vlContext *old;
+
+	assert(surface);
+
+	old = surface->context;
+	surface->context = context;
+
+	return old;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.h b/src/gallium/state_trackers/g3dvl/vl_surface.h
new file mode 100644
index 00000000000..133e1515ef3
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_surface.h
@@ -0,0 +1,86 @@
+#ifndef vl_surface_h
+#define vl_surface_h
+
+#include "vl_types.h"
+
+#ifdef VL_INTERNAL
+struct pipe_texture;
+
+struct vlSurface
+{
+	struct vlScreen			*screen;
+	struct vlContext		*context;
+	unsigned int			width;
+	unsigned int			height;
+	enum vlFormat			format;
+	struct pipe_texture		*texture;
+	struct pipe_fence_handle	*render_fence;
+	struct pipe_fence_handle	*disp_fence;
+};
+#endif
+
+int vlCreateSurface
+(
+	struct vlScreen *screen,
+	unsigned int width,
+	unsigned int height,
+	enum vlFormat format,
+	struct vlSurface **surface
+);
+
+int vlDestroySurface
+(
+	struct vlSurface *surface
+);
+
+int vlRenderMacroBlocksMpeg2
+(
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
+);
+
+int vlPutPicture
+(
+	struct vlSurface *surface,
+	vlNativeDrawable drawable,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,
+	int desty,
+	int destw,
+	int desth,
+	int drawable_w,
+	int drawable_h,
+	enum vlPictureType picture_type
+);
+
+int vlSurfaceGetStatus
+(
+	struct vlSurface *surface,
+	enum vlResourceStatus *status
+);
+
+int vlSurfaceFlush
+(
+	struct vlSurface *surface
+);
+
+int vlSurfaceSync
+(
+	struct vlSurface *surface
+);
+
+struct vlScreen* vlSurfaceGetScreen
+(
+	struct vlSurface *surface
+);
+
+struct vlContext* vlBindToContext
+(
+	struct vlSurface *surface,
+	struct vlContext *context
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_types.h b/src/gallium/state_trackers/g3dvl/vl_types.h
new file mode 100644
index 00000000000..274e1f74377
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_types.h
@@ -0,0 +1,115 @@
+#ifndef vl_types_h
+#define vl_types_h
+
+#if 1 /*#ifdef X11*/
+#include <X11/Xlib.h>
+
+typedef Display* vlNativeDisplay;
+typedef Drawable vlNativeDrawable;
+#endif
+
+struct vlDisplay;
+struct vlScreen;
+struct vlContext;
+struct vlSurface;
+
+enum vlResourceStatus
+{
+	vlResourceStatusFree,
+	vlResourceStatusRendering,
+	vlResourceStatusDisplaying
+};
+
+enum vlProfile
+{
+	vlProfileMpeg2Simple,
+	vlProfileMpeg2Main,
+
+	vlProfileCount
+};
+
+enum vlEntryPoint
+{
+	vlEntryPointIDCT,
+	vlEntryPointMC,
+	vlEntryPointCSC,
+
+	vlEntryPointCount
+};
+
+enum vlFormat
+{
+	vlFormatYCbCr420,
+	vlFormatYCbCr422,
+	vlFormatYCbCr444
+};
+
+enum vlPictureType
+{
+	vlPictureTypeTopField,
+	vlPictureTypeBottomField,
+	vlPictureTypeFrame
+};
+
+enum vlMotionType
+{
+	vlMotionTypeField,
+	vlMotionTypeFrame,
+	vlMotionTypeDualPrime,
+	vlMotionType16x8
+};
+
+enum vlFieldOrder
+{
+	vlFieldOrderFirst,
+	vlFieldOrderSecond
+};
+
+enum vlDCTType
+{
+	vlDCTTypeFrameCoded,
+	vlDCTTypeFieldCoded
+};
+
+struct vlVertex2f
+{
+	float x, y;
+};
+
+struct vlVertex4f
+{
+	float x, y, z, w;
+};
+
+enum vlMacroBlockType
+{
+	vlMacroBlockTypeIntra,
+	vlMacroBlockTypeFwdPredicted,
+	vlMacroBlockTypeBkwdPredicted,
+	vlMacroBlockTypeBiPredicted,
+
+	vlNumMacroBlockTypes
+};
+
+struct vlMpeg2MacroBlock
+{
+	unsigned int		mbx, mby;
+	enum vlMacroBlockType	mb_type;
+	enum vlMotionType	mo_type;
+	enum vlDCTType		dct_type;
+	int			PMV[2][2][2];
+	unsigned int		cbp;
+	short			*blocks;
+};
+
+struct vlMpeg2MacroBlockBatch
+{
+	struct vlSurface		*past_surface;
+	struct vlSurface		*future_surface;
+	enum vlPictureType		picture_type;
+	enum vlFieldOrder		field_order;
+	unsigned int			num_macroblocks;
+	struct vlMpeg2MacroBlock	*macroblocks;
+};
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.c b/src/gallium/state_trackers/g3dvl/vl_util.c
new file mode 100644
index 00000000000..50aa9af66f2
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_util.c
@@ -0,0 +1,16 @@
+#include "vl_util.h"
+#include <assert.h>
+
+unsigned int vlRoundUpPOT(unsigned int x)
+{
+	unsigned int i;
+
+	assert(x > 0);
+
+	--x;
+
+	for (i = 1; i < sizeof(unsigned int) * 8; i <<= 1)
+		x |= x >> i;
+
+	return x + 1;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.h b/src/gallium/state_trackers/g3dvl/vl_util.h
new file mode 100644
index 00000000000..bc98e79df47
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_util.h
@@ -0,0 +1,6 @@
+#ifndef vl_util_h
+#define vl_util_h
+
+unsigned int vlRoundUpPOT(unsigned int x);
+
+#endif
diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c
index 3a486481f56..ed753689829 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_screen.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c
@@ -113,7 +113,118 @@ static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
 
 extern const struct dri_extension card_extensions[];
 
+static GLboolean
+intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
+{
+   int ret;
+   struct drm_i915_getparam gp;
+
+   gp.param = param;
+   gp.value = value;
+
+   ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+   if (ret) {
+      fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+      return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+static void
+intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
+		  unsigned long long offset, int depth, uint pitch)
+{
+   abort();
+#if 0
+   struct intel_context *intel = (struct intel_context*)
+      ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
+   struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
+   struct st_texture_object *stObj = st_texture_object(tObj);
+
+   if (!stObj)
+      return;
+
+   if (stObj->pt)
+      st->pipe->texture_release(intel->st->pipe, &stObj->pt);
+
+   stObj->imageOverride = GL_TRUE;
+   stObj->depthOverride = depth;
+   stObj->pitchOverride = pitch;
+
+   if (offset)
+      stObj->textureOffset = offset;
+#endif
+}
+
+
+#if 0
+static void
+intelHandleDrawableConfig(__DRIdrawablePrivate *dPriv,
+			  __DRIcontextPrivate *pcp,
+			  __DRIDrawableConfigEvent *event)
+{
+   (void) dPriv;
+   (void) pcp;
+   (void) event;
+}
+#endif
+
+#if 0
+static void
+intelHandleBufferAttach(__DRIdrawablePrivate *dPriv,
+			__DRIcontextPrivate *pcp,
+			__DRIBufferAttachEvent *ba)
+{
+   struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv);
+
+   switch (ba->buffer.attachment) {
+   case DRI_DRAWABLE_BUFFER_FRONT_LEFT:
+      intelScreen->front.width = dPriv->w;
+      intelScreen->front.height = dPriv->h;
+      intelScreen->front.cpp = ba->buffer.cpp;
+      intelScreen->front.pitch = ba->buffer.pitch;
+      driGenBuffers(intelScreen->base.staticPool, "front", 1, &intelScreen->front.buffer, 0, 0, 0);
+      driBOSetReferenced(intelScreen->front.buffer, ba->buffer.handle);
+      break;
+
+   case DRI_DRAWABLE_BUFFER_BACK_LEFT:
+   case DRI_DRAWABLE_BUFFER_DEPTH:
+   case DRI_DRAWABLE_BUFFER_STENCIL:
+   case DRI_DRAWABLE_BUFFER_ACCUM:
+      /* anything ?? */
+      break;
+
+   default:
+      fprintf(stderr, "unhandled buffer attach event, attachment type %d\n",
+	      ba->buffer.attachment);
+      return;
+   }
+}
+#endif
+
+static const __DRItexOffsetExtension intelTexOffsetExtension = {
+   { __DRI_TEX_OFFSET },
+   intelSetTexOffset,
+};
+
+#if 0
+static const __DRItexBufferExtension intelTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   intelSetTexBuffer,
+};
+#endif
 
+static const __DRIextension *intelScreenExtensions[] = {
+    &driReadDrawableExtension,
+    &driCopySubBufferExtension.base,
+    &driSwapControlExtension.base,
+    &driFrameTrackingExtension.base,
+    &driMediaStreamCounterExtension.base,
+    &intelTexOffsetExtension.base,
+//    &intelTexBufferExtension.base,
+    NULL
+};
 
 
 static void
@@ -232,7 +343,8 @@ intelCreatePools(__DRIscreenPrivate * sPriv)
 
    intelScreen->havePools = GL_TRUE;
 
-   intelUpdateScreenRotation(sPriv, intelScreen->sarea);
+   if (intelScreen->sarea)
+	intelUpdateScreenRotation(sPriv, intelScreen->sarea);
 
    return GL_TRUE;
 }
@@ -265,11 +377,6 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
    struct intel_screen *intelScreen;
    I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv;
 
-   PFNGLXSCRENABLEEXTENSIONPROC glx_enable_extension =
-      (PFNGLXSCRENABLEEXTENSIONPROC) (*dri_interface->
-                                      getProcAddress("glxEnableExtension"));
-   void *const psc = sPriv->psc->screenConfigs;
-
    if (sPriv->devPrivSize != sizeof(I830DRIRec)) {
       fprintf(stderr,
               "\nERROR!  sizeof(I830DRIRec) does not match passed size from device driver\n");
@@ -286,28 +393,19 @@ intelInitDriver(__DRIscreenPrivate * sPriv)
                       __driConfigOptions, __driNConfigOptions);
 
    sPriv->private = (void *) intelScreen;
-
    intelScreen->sarea = (drmI830Sarea *) (((GLubyte *) sPriv->pSAREA) +
-					  gDRIPriv->sarea_priv_offset);
-   intelScreen->deviceID = gDRIPriv->deviceID;
-   intelScreen->front.cpp = gDRIPriv->cpp;
-   intelScreen->drmMinor = sPriv->drmMinor;
+                                            gDRIPriv->sarea_priv_offset);
 
-   assert(gDRIPriv->bitsPerPixel == 16 ||
-	  gDRIPriv->bitsPerPixel == 32);
+   intelScreen->deviceID = gDRIPriv->deviceID;
 
+   intelScreen->front.cpp = gDRIPriv->cpp;
+   intelScreen->drmMinor = sPriv->drm_version.minor;
    intelUpdateScreenRotation(sPriv, intelScreen->sarea);
 
    if (0)
       intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv);
 
-   if (glx_enable_extension != NULL) {
-      (*glx_enable_extension) (psc, "GLX_SGI_swap_control");
-      (*glx_enable_extension) (psc, "GLX_SGI_video_sync");
-      (*glx_enable_extension) (psc, "GLX_MESA_swap_control");
-      (*glx_enable_extension) (psc, "GLX_MESA_swap_frame_usage");
-      (*glx_enable_extension) (psc, "GLX_SGI_make_current_read");
-   }
+   sPriv->extensions = intelScreenExtensions;
 
    intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
    intelScreen->base.base.get_name = intel_get_name;
@@ -387,7 +485,7 @@ intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv)
 {
    struct intel_framebuffer *intelfb = intel_framebuffer(driDrawPriv);
    assert(intelfb->stfb);
-   st_unreference_framebuffer(&intelfb->stfb);
+   st_unreference_framebuffer(intelfb->stfb);
    free(intelfb);
 }
 
@@ -406,65 +504,19 @@ intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo)
    return 0;
 }
 
-
-static void
-intelSetTexOffset(__DRIcontext *pDRICtx, int texname,
-		  unsigned long long offset, int depth, uint pitch)
-{
-   abort();
-#if 0
-   struct intel_context *intel = (struct intel_context*)
-      ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
-   struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname);
-   struct st_texture_object *stObj = st_texture_object(tObj);
-
-   if (!stObj)
-      return;
-
-   if (stObj->pt)
-      st->pipe->texture_release(intel->st->pipe, &stObj->pt);
-
-   stObj->imageOverride = GL_TRUE;
-   stObj->depthOverride = depth;
-   stObj->pitchOverride = pitch;
-
-   if (offset)
-      stObj->textureOffset = offset;
-#endif
-}
-
-
-static const struct __DriverAPIRec intelAPI = {
-   .InitDriver = intelInitDriver,
-   .DestroyScreen = intelDestroyScreen,
-   .CreateContext = intelCreateContext,
-   .DestroyContext = intelDestroyContext,
-   .CreateBuffer = intelCreateBuffer,
-   .DestroyBuffer = intelDestroyBuffer,
-   .SwapBuffers = intelSwapBuffers,
-   .MakeCurrent = intelMakeCurrent,
-   .UnbindContext = intelUnbindContext,
-   .GetSwapInfo = intelGetSwapInfo,
-   .GetMSC = driGetMSC32,
-   .WaitForMSC = driWaitForMSC32,
-   .WaitForSBC = NULL,
-   .SwapBuffersMSC = NULL,
-   .CopySubBuffer = intelCopySubBuffer,
-   .setTexOffset = intelSetTexOffset,
-};
-
-
-static __GLcontextModes *
-intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
-                 unsigned stencil_bits, boolean have_back_buffer)
+static __DRIconfig **
+intelFillInModes(__DRIscreenPrivate *psp,
+		 unsigned pixel_bits, unsigned depth_bits,
+                 unsigned stencil_bits, GLboolean have_back_buffer)
 {
-   __GLcontextModes *modes;
+   __DRIconfig **configs;
    __GLcontextModes *m;
    unsigned num_modes;
    unsigned depth_buffer_factor;
    unsigned back_buffer_factor;
    GLenum fb_format;
    GLenum fb_type;
+   int i;
 
    /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
     * support pageflipping at all.
@@ -508,100 +560,144 @@ intelFillInModes(unsigned pixel_bits, unsigned depth_bits,
       fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
    }
 
-   modes =
-      (*dri_interface->createContextModes) (num_modes,
-                                            sizeof(__GLcontextModes));
-   m = modes;
-   if (!driFillInModes(&m, fb_format, fb_type,
-                       depth_bits_array, stencil_bits_array,
-                       depth_buffer_factor, back_buffer_modes,
-                       back_buffer_factor, msaa_samples_array, 1, GLX_TRUE_COLOR)) {
-      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
-              __LINE__);
-      return NULL;
-   }
-   if (!driFillInModes(&m, fb_format, fb_type,
-                       depth_bits_array, stencil_bits_array,
-                       depth_buffer_factor, back_buffer_modes,
-                       back_buffer_factor, msaa_samples_array, 1, GLX_DIRECT_COLOR)) {
-      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+   configs = driCreateConfigs(fb_format, fb_type,
+			      depth_bits_array, stencil_bits_array,
+			      depth_buffer_factor, back_buffer_modes,
+			      back_buffer_factor, msaa_samples_array, 1);
+   if (configs == NULL) {
+    fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
               __LINE__);
       return NULL;
    }
 
    /* Mark the visual as slow if there are "fake" stencil bits.
     */
-   for (m = modes; m != NULL; m = m->next) {
+   for (i = 0; configs[i]; i++) {
+      m = &configs[i]->modes;
       if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
          m->visualRating = GLX_SLOW_CONFIG;
       }
    }
 
-   return modes;
+   return configs;
 }
 
-
 /**
- * This is the bootstrap function for the driver.  libGL supplies all of the
- * requisite information about the system, and the driver initializes itself.
- * This routine also fills in the linked list pointed to by \c driver_modes
- * with the \c __GLcontextModes that the driver can support for windows or
- * pbuffers.
+ * This is the driver specific part of the createNewScreen entry point.
+ * 
+ * \todo maybe fold this into intelInitDriver
  *
- * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on
- *         failure.
+ * \return the __GLcontextModes supported by this driver
  */
-PUBLIC void *
-__driCreateNewScreen_20050727(__DRInativeDisplay * dpy, int scrn,
-                              __DRIscreen * psc,
-                              const __GLcontextModes * modes,
-                              const __DRIversion * ddx_version,
-                              const __DRIversion * dri_version,
-                              const __DRIversion * drm_version,
-                              const __DRIframebuffer * frame_buffer,
-                              drmAddress pSAREA, int fd,
-                              int internal_api_version,
-                              const __DRIinterfaceMethods * interface,
-                              __GLcontextModes ** driver_modes)
+static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp)
 {
-   __DRIscreenPrivate *psp;
-   static const __DRIversion ddx_expected = { 1, 7, 0 };
+#ifdef I915
+   static const __DRIversion ddx_expected = { 1, 5, 0 };
+#else
+   static const __DRIversion ddx_expected = { 1, 6, 0 };
+#endif
    static const __DRIversion dri_expected = { 4, 0, 0 };
-   static const __DRIversion drm_expected = { 1, 7, 0 };
-
-   dri_interface = interface;
+   static const __DRIversion drm_expected = { 1, 5, 0 };
+   I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
 
    if (!driCheckDriDdxDrmVersions2("i915",
-                                   dri_version, &dri_expected,
-                                   ddx_version, &ddx_expected,
-                                   drm_version, &drm_expected)) {
+                                   &psp->dri_version, &dri_expected,
+                                   &psp->ddx_version, &ddx_expected,
+                                   &psp->drm_version, &drm_expected)) {
       return NULL;
    }
 
-   psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL,
-                                  ddx_version, dri_version, drm_version,
-                                  frame_buffer, pSAREA, fd,
-                                  internal_api_version, &intelAPI);
-
-   if (psp != NULL) {
-      I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
-      *driver_modes = intelFillInModes(dri_priv->cpp * 8,
-                                       (dri_priv->cpp == 2) ? 16 : 24,
-                                       (dri_priv->cpp == 2) ? 0 : 8, 1);
-
-      /* Calling driInitExtensions here, with a NULL context pointer,
-       * does not actually enable the extensions.  It just makes sure
-       * that all the dispatch offsets for all the extensions that
-       * *might* be enables are known.  This is needed because the
-       * dispatch offsets need to be known when _mesa_context_create
-       * is called, but we can't enable the extensions until we have a
-       * context pointer.
-       *
-       * Hello chicken.  Hello egg.  How are you two today?
-       */
-      driInitExtensions(NULL, card_extensions, GL_FALSE);
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enables are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create is
+    * called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   driInitExtensions( NULL, card_extensions, GL_FALSE );
+   //intelInitExtensions(NULL, GL_TRUE);
+	   
+   if (!intelInitDriver(psp))
+       return NULL;
+
+   psp->extensions = intelScreenExtensions;
+
+   return (const __DRIconfig **)
+       intelFillInModes(psp, dri_priv->cpp * 8,
+			(dri_priv->cpp == 2) ? 16 : 24,
+			(dri_priv->cpp == 2) ? 0  : 8, 1);
+}
+
+/**
+ * This is the driver specific part of the createNewScreen entry point.
+ * 
+ * \return the __GLcontextModes supported by this driver
+ */
+static const
+__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
+{
+   struct intel_screen *intelScreen;
+
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enables are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create is
+    * called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   //intelInitExtensions(NULL, GL_TRUE);
+
+   /* Allocate the private area */
+   intelScreen = CALLOC_STRUCT(intel_screen);
+   if (!intelScreen) {
+      fprintf(stderr, "\nERROR!  Allocating private area failed\n");
+      return GL_FALSE;
    }
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo(&intelScreen->optionCache,
+                      __driConfigOptions, __driNConfigOptions);
+
+   psp->private = (void *) intelScreen;
+
+   intelScreen->drmMinor = psp->drm_version.minor;
 
-   return (void *) psp;
+   /* Determine chipset ID? */
+   if (!intel_get_param(psp, I915_PARAM_CHIPSET_ID,
+			&intelScreen->deviceID))
+      return GL_FALSE;
+
+   psp->extensions = intelScreenExtensions;
+
+   intel_be_init_device(&intelScreen->base, psp->fd, intelScreen->deviceID);
+   intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer;
+   intelScreen->base.base.get_name = intel_get_name;
+
+   return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1),
+			   intelFillInModes(psp, 32, 24, 8, 1));
 }
 
+const struct __DriverAPIRec driDriverAPI = {
+   .InitScreen		 = intelInitScreen,
+   .DestroyScreen	 = intelDestroyScreen,
+   .CreateContext	 = intelCreateContext,
+   .DestroyContext	 = intelDestroyContext,
+   .CreateBuffer	 = intelCreateBuffer,
+   .DestroyBuffer	 = intelDestroyBuffer,
+   .SwapBuffers		 = intelSwapBuffers,
+   .MakeCurrent		 = intelMakeCurrent,
+   .UnbindContext	 = intelUnbindContext,
+   .GetSwapInfo		 = intelGetSwapInfo,
+   .GetDrawableMSC	 = driDrawableGetMSC32,
+   .WaitForMSC		 = driWaitForMSC32,
+   .CopySubBuffer	 = intelCopySubBuffer,
+
+   //.InitScreen2		 = intelInitScreen2,
+   //.HandleDrawableConfig = intelHandleDrawableConfig,
+   //.HandleBufferAttach	 = intelHandleBufferAttach,
+};
diff --git a/src/gallium/winsys/drm/intel/egl/intel_device.c b/src/gallium/winsys/drm/intel/egl/intel_device.c
index b9649cbec71..1964745c994 100644
--- a/src/gallium/winsys/drm/intel/egl/intel_device.c
+++ b/src/gallium/winsys/drm/intel/egl/intel_device.c
@@ -131,7 +131,7 @@ intel_destroy_drawable(struct egl_drm_drawable *drawable)
 	drawable->priv = NULL;
 
 	assert(intelfb->stfb);
-	st_unreference_framebuffer(&intelfb->stfb);
+	st_unreference_framebuffer(intelfb->stfb);
 	free(intelfb);
 	return TRUE;
 }
diff --git a/src/gallium/winsys/drm/intel/egl/intel_egl.c b/src/gallium/winsys/drm/intel/egl/intel_egl.c
index 3204ed3131c..ed464076ee3 100644
--- a/src/gallium/winsys/drm/intel/egl/intel_egl.c
+++ b/src/gallium/winsys/drm/intel/egl/intel_egl.c
@@ -565,7 +565,7 @@ static struct drm_mode_modeinfo *
 drm_find_mode(drmModeConnectorPtr connector, _EGLMode *mode)
 {
 	int i;
-	struct drm_mode_modeinfo *m;
+	struct drm_mode_modeinfo *m = NULL;
 
 	for (i = 0; i < connector->count_modes; i++) {
 		m = &connector->modes[i];
diff --git a/src/gallium/winsys/drm/nouveau/Makefile b/src/gallium/winsys/drm/nouveau/Makefile
new file mode 100644
index 00000000000..81562ca78d3
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/Makefile
@@ -0,0 +1,46 @@
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nouveau_dri.so
+
+MINIGLX_SOURCES =
+
+PIPE_DRIVERS = \
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+	$(TOP)/src/gallium/drivers/nv04/libnv04.a \
+	$(TOP)/src/gallium/drivers/nv10/libnv10.a \
+	$(TOP)/src/gallium/drivers/nv20/libnv20.a \
+	$(TOP)/src/gallium/drivers/nv30/libnv30.a \
+	$(TOP)/src/gallium/drivers/nv40/libnv40.a \
+	$(TOP)/src/gallium/drivers/nv50/libnv50.a
+
+DRIVER_SOURCES = \
+	nouveau_bo.c \
+	nouveau_channel.c \
+	nouveau_context.c \
+	nouveau_device.c \
+	nouveau_dma.c \
+	nouveau_fence.c \
+	nouveau_grobj.c \
+	nouveau_lock.c \
+	nouveau_notifier.c \
+	nouveau_pushbuf.c \
+	nouveau_resource.c \
+	nouveau_screen.c \
+	nouveau_swapbuffers.c \
+	nouveau_winsys.c \
+	nouveau_winsys_pipe.c \
+	nouveau_winsys_softpipe.c \
+	nv04_surface.c \
+	nv50_surface.c
+
+C_SOURCES = \
+	$(COMMON_GALLIUM_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../Makefile.template
+
+symlinks:
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_bo.c b/src/gallium/winsys/drm/nouveau/nouveau_bo.c
new file mode 100644
index 00000000000..b5942994d94
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_bo.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+#include "nouveau_local.h"
+
+static void
+nouveau_mem_free(struct nouveau_device *dev, struct drm_nouveau_mem_alloc *ma,
+		 void **map)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	struct drm_nouveau_mem_free mf;
+
+	if (map && *map) {
+		drmUnmap(*map, ma->size);
+		*map = NULL;
+	}
+
+	if (ma->size) {
+		mf.offset = ma->offset;
+		mf.flags = ma->flags;
+		drmCommandWrite(nvdev->fd, DRM_NOUVEAU_MEM_FREE,
+				&mf, sizeof(mf));
+		ma->size = 0;
+	}
+}
+
+static int
+nouveau_mem_alloc(struct nouveau_device *dev, unsigned size, unsigned align,
+		  uint32_t flags, struct drm_nouveau_mem_alloc *ma, void **map)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	int ret;
+
+	ma->alignment = align;
+	ma->size = size;
+	ma->flags = flags;
+	if (map)
+		ma->flags |= NOUVEAU_MEM_MAPPED;
+	ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_MEM_ALLOC, ma,
+				  sizeof(struct drm_nouveau_mem_alloc));
+	if (ret)
+		return ret;
+
+	if (map) {
+		ret = drmMap(nvdev->fd, ma->map_handle, ma->size, map);
+		if (ret) {
+			*map = NULL;
+			nouveau_mem_free(dev, ma, map);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void
+nouveau_bo_tmp_del(void *priv)
+{
+	struct nouveau_resource *r = priv;
+
+	nouveau_fence_ref(NULL, (struct nouveau_fence **)&r->priv);
+	nouveau_resource_free(&r);
+}
+
+static unsigned
+nouveau_bo_tmp_max(struct nouveau_device_priv *nvdev)
+{
+	struct nouveau_resource *r = nvdev->sa_heap;
+	unsigned max = 0;
+
+	while (r) {
+		if (r->in_use && !nouveau_fence(r->priv)->emitted) {
+			r = r->next;
+			continue;
+		}
+
+		if (max < r->size)
+			max = r->size;
+		r = r->next;
+	}
+
+	return max;
+}
+
+static struct nouveau_resource *
+nouveau_bo_tmp(struct nouveau_channel *chan, unsigned size,
+	       struct nouveau_fence *fence)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(chan->device);
+	struct nouveau_resource *r = NULL;
+	struct nouveau_fence *ref = NULL;
+
+	if (fence)
+		nouveau_fence_ref(fence, &ref);
+	else
+		nouveau_fence_new(chan, &ref);
+	assert(ref);
+
+	while (nouveau_resource_alloc(nvdev->sa_heap, size, ref, &r)) {
+		if (nouveau_bo_tmp_max(nvdev) < size) {
+			nouveau_fence_ref(NULL, &ref);
+			return NULL;
+		}
+
+		nouveau_fence_flush(chan);
+	}
+	nouveau_fence_signal_cb(ref, nouveau_bo_tmp_del, r);
+
+	return r;
+}
+
+int
+nouveau_bo_init(struct nouveau_device *dev)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	int ret;
+
+	ret = nouveau_mem_alloc(dev, 128*1024, 0, NOUVEAU_MEM_AGP |
+				NOUVEAU_MEM_PCI, &nvdev->sa, &nvdev->sa_map);
+	if (ret)
+		return ret;
+
+	ret = nouveau_resource_init(&nvdev->sa_heap, 0, nvdev->sa.size);
+	if (ret) {
+		nouveau_mem_free(dev, &nvdev->sa, &nvdev->sa_map);
+		return ret;
+	}
+
+	return 0;
+}
+
+void
+nouveau_bo_takedown(struct nouveau_device *dev)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+
+	nouveau_mem_free(dev, &nvdev->sa, &nvdev->sa_map);
+}
+
+int
+nouveau_bo_new(struct nouveau_device *dev, uint32_t flags, int align,
+	       int size, struct nouveau_bo **bo)
+{
+	struct nouveau_bo_priv *nvbo;
+	int ret;
+
+	if (!dev || !bo || *bo)
+		return -EINVAL;
+
+	nvbo = calloc(1, sizeof(struct nouveau_bo_priv));
+	if (!nvbo)
+		return -ENOMEM;
+	nvbo->base.device = dev;
+	nvbo->base.size = size;
+	nvbo->base.handle = bo_to_ptr(nvbo);
+	nvbo->drm.alignment = align;
+	nvbo->refcount = 1;
+
+	if (flags & NOUVEAU_BO_TILED) {
+		nvbo->tiled = 1;
+		if (flags & NOUVEAU_BO_ZTILE)
+			nvbo->tiled |= 2;
+		flags &= ~NOUVEAU_BO_TILED;
+	}
+
+	ret = nouveau_bo_set_status(&nvbo->base, flags);
+	if (ret) {
+		free(nvbo);
+		return ret;
+	}
+
+	*bo = &nvbo->base;
+	return 0;
+}
+
+int
+nouveau_bo_user(struct nouveau_device *dev, void *ptr, int size,
+		struct nouveau_bo **bo)
+{
+	struct nouveau_bo_priv *nvbo;
+
+	if (!dev || !bo || *bo)
+		return -EINVAL;
+
+	nvbo = calloc(1, sizeof(*nvbo));
+	if (!nvbo)
+		return -ENOMEM;
+	nvbo->base.device = dev;
+	
+	nvbo->sysmem = ptr;
+	nvbo->user = 1;
+
+	nvbo->base.size = size;
+	nvbo->base.offset = nvbo->drm.offset;
+	nvbo->base.handle = bo_to_ptr(nvbo);
+	nvbo->refcount = 1;
+	*bo = &nvbo->base;
+	return 0;
+}
+
+int
+nouveau_bo_ref(struct nouveau_device *dev, uint64_t handle,
+	       struct nouveau_bo **bo)
+{
+	struct nouveau_bo_priv *nvbo = ptr_to_bo(handle);
+
+	if (!dev || !bo || *bo)
+		return -EINVAL;
+
+	nvbo->refcount++;
+	*bo = &nvbo->base;
+	return 0;
+}
+
+static void
+nouveau_bo_del_cb(void *priv)
+{
+	struct nouveau_bo_priv *nvbo = priv;
+
+	nouveau_fence_ref(NULL, &nvbo->fence);
+	nouveau_mem_free(nvbo->base.device, &nvbo->drm, &nvbo->map);
+	if (nvbo->sysmem && !nvbo->user)
+		free(nvbo->sysmem);
+	free(nvbo);
+}
+
+void
+nouveau_bo_del(struct nouveau_bo **bo)
+{
+	struct nouveau_bo_priv *nvbo;
+
+	if (!bo || !*bo)
+		return;
+	nvbo = nouveau_bo(*bo);
+	*bo = NULL;
+
+	if (--nvbo->refcount)
+		return;
+
+	if (nvbo->pending)
+		nouveau_pushbuf_flush(nvbo->pending->channel, 0);
+
+	if (nvbo->fence)
+		nouveau_fence_signal_cb(nvbo->fence, nouveau_bo_del_cb, nvbo);
+	else
+		nouveau_bo_del_cb(nvbo);
+}
+
+int
+nouveau_bo_map(struct nouveau_bo *bo, uint32_t flags)
+{
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+
+	if (!nvbo)
+		return -EINVAL;
+
+	if (nvbo->pending &&
+	    (nvbo->pending->flags & NOUVEAU_BO_WR || flags & NOUVEAU_BO_WR)) {
+		nouveau_pushbuf_flush(nvbo->pending->channel, 0);
+	}
+
+	if (flags & NOUVEAU_BO_WR)
+		nouveau_fence_wait(&nvbo->fence);
+	else
+		nouveau_fence_wait(&nvbo->wr_fence);
+
+	if (nvbo->sysmem)
+		bo->map = nvbo->sysmem;
+	else
+		bo->map = nvbo->map;
+	return 0;
+}
+
+void
+nouveau_bo_unmap(struct nouveau_bo *bo)
+{
+	bo->map = NULL;
+}
+
+static int
+nouveau_bo_upload(struct nouveau_bo_priv *nvbo)
+{
+	if (nvbo->fence)
+		nouveau_fence_wait(&nvbo->fence);
+	memcpy(nvbo->map, nvbo->sysmem, nvbo->drm.size);
+	return 0;
+}
+
+int
+nouveau_bo_set_status(struct nouveau_bo *bo, uint32_t flags)
+{
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	struct drm_nouveau_mem_alloc new;
+	void *new_map = NULL, *new_sysmem = NULL;
+	unsigned new_flags = 0, ret;
+
+	assert(!bo->map);
+
+	/* Check current memtype vs requested, if they match do nothing */
+	if ((nvbo->drm.flags & NOUVEAU_MEM_FB) && (flags & NOUVEAU_BO_VRAM))
+		return 0;
+	if ((nvbo->drm.flags & (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI)) &&
+	    (flags & NOUVEAU_BO_GART))
+		return 0;
+	if (nvbo->drm.size == 0 && nvbo->sysmem && (flags & NOUVEAU_BO_LOCAL))
+		return 0;
+
+	memset(&new, 0x00, sizeof(new));
+
+	/* Allocate new memory */
+	if (flags & NOUVEAU_BO_VRAM)
+		new_flags |= NOUVEAU_MEM_FB;
+	else
+	if (flags & NOUVEAU_BO_GART)
+		new_flags |= (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI);
+	
+	if (nvbo->tiled && flags) {
+		new_flags |= NOUVEAU_MEM_TILE;
+		if (nvbo->tiled & 2)
+			new_flags |= NOUVEAU_MEM_TILE_ZETA;
+	}
+
+	if (new_flags) {
+		ret = nouveau_mem_alloc(bo->device, bo->size,
+					nvbo->drm.alignment, new_flags,
+					&new, &new_map);
+		if (ret)
+			return ret;
+	} else
+	if (!nvbo->user) {
+		new_sysmem = malloc(bo->size);
+	}
+
+	/* Copy old -> new */
+	/*XXX: use M2MF */
+	if (nvbo->sysmem || nvbo->map) {
+		struct nouveau_pushbuf_bo *pbo = nvbo->pending;
+		nvbo->pending = NULL;
+		nouveau_bo_map(bo, NOUVEAU_BO_RD);
+		memcpy(new_map, bo->map, bo->size);
+		nouveau_bo_unmap(bo);
+		nvbo->pending = pbo;
+	}
+
+	/* Free old memory */
+	if (nvbo->fence)
+		nouveau_fence_wait(&nvbo->fence);
+	nouveau_mem_free(bo->device, &nvbo->drm, &nvbo->map);
+	if (nvbo->sysmem && !nvbo->user)
+		free(nvbo->sysmem);
+
+	nvbo->drm = new;
+	nvbo->map = new_map;
+	if (!nvbo->user)
+		nvbo->sysmem = new_sysmem;
+	bo->flags = flags;
+	bo->offset = nvbo->drm.offset;
+	return 0;
+}
+
+static int
+nouveau_bo_validate_user(struct nouveau_channel *chan, struct nouveau_bo *bo,
+			 struct nouveau_fence *fence, uint32_t flags)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_device_priv *nvdev = nouveau_device(chan->device);
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	struct nouveau_resource *r;
+
+	if (nvchan->user_charge + bo->size > nvdev->sa.size)
+		return 1;
+
+	if (!(flags & NOUVEAU_BO_GART))
+		return 1;
+
+	r = nouveau_bo_tmp(chan, bo->size, fence);
+	if (!r)
+		return 1;
+	nvchan->user_charge += bo->size;
+
+	memcpy(nvdev->sa_map + r->start, nvbo->sysmem, bo->size);
+
+	nvbo->offset = nvdev->sa.offset + r->start;
+	nvbo->flags = NOUVEAU_BO_GART;
+	return 0;
+}
+
+static int
+nouveau_bo_validate_bo(struct nouveau_channel *chan, struct nouveau_bo *bo,
+		       struct nouveau_fence *fence, uint32_t flags)
+{
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	int ret;
+
+	ret = nouveau_bo_set_status(bo, flags);
+	if (ret) {
+		nouveau_fence_flush(chan);
+
+		ret = nouveau_bo_set_status(bo, flags);
+		if (ret)
+			return ret;
+	}
+
+	if (nvbo->user)
+		nouveau_bo_upload(nvbo);
+
+	nvbo->offset = nvbo->drm.offset;
+	if (nvbo->drm.flags & (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI))
+		nvbo->flags = NOUVEAU_BO_GART;
+	else
+		nvbo->flags = NOUVEAU_BO_VRAM;
+
+	return 0;
+}
+
+int
+nouveau_bo_validate(struct nouveau_channel *chan, struct nouveau_bo *bo,
+		    uint32_t flags)
+{
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	struct nouveau_fence *fence = nouveau_pushbuf(chan->pushbuf)->fence;
+	int ret;
+
+	assert(bo->map == NULL);
+
+	if (nvbo->user) {
+		ret = nouveau_bo_validate_user(chan, bo, fence, flags);
+		if (ret) {
+			ret = nouveau_bo_validate_bo(chan, bo, fence, flags);
+			if (ret)
+				return ret;
+		}
+	} else {
+		ret = nouveau_bo_validate_bo(chan, bo, fence, flags);
+		if (ret)
+			return ret;
+	}
+
+	if (flags & NOUVEAU_BO_WR)
+		nouveau_fence_ref(fence, &nvbo->wr_fence);
+	nouveau_fence_ref(fence, &nvbo->fence);
+	return 0;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_channel.c b/src/gallium/winsys/drm/nouveau/nouveau_channel.c
new file mode 100644
index 00000000000..3b4dcd1ecf2
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_channel.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+
+int
+nouveau_channel_alloc(struct nouveau_device *dev, uint32_t fb_ctxdma,
+		      uint32_t tt_ctxdma, struct nouveau_channel **chan)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	struct nouveau_channel_priv *nvchan;
+	int ret;
+
+	if (!nvdev || !chan || *chan)
+	    return -EINVAL;
+
+	nvchan = calloc(1, sizeof(struct nouveau_channel_priv));
+	if (!nvchan)
+		return -ENOMEM;
+	nvchan->base.device = dev;
+
+	nvchan->drm.fb_ctxdma_handle = fb_ctxdma;
+	nvchan->drm.tt_ctxdma_handle = tt_ctxdma;
+	ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_CHANNEL_ALLOC,
+				  &nvchan->drm, sizeof(nvchan->drm));
+	if (ret) {
+		free(nvchan);
+		return ret;
+	}
+
+	nvchan->base.id = nvchan->drm.channel;
+	if (nouveau_grobj_ref(&nvchan->base, nvchan->drm.fb_ctxdma_handle,
+			      &nvchan->base.vram) ||
+	    nouveau_grobj_ref(&nvchan->base, nvchan->drm.tt_ctxdma_handle,
+		    	      &nvchan->base.gart)) {
+		nouveau_channel_free((void *)&nvchan);
+		return -EINVAL;
+	}
+
+	ret = drmMap(nvdev->fd, nvchan->drm.ctrl, nvchan->drm.ctrl_size,
+		     (void*)&nvchan->user);
+	if (ret) {
+		nouveau_channel_free((void *)&nvchan);
+		return ret;
+	}
+	nvchan->put     = &nvchan->user[0x40/4];
+	nvchan->get     = &nvchan->user[0x44/4];
+	nvchan->ref_cnt = &nvchan->user[0x48/4];
+
+	ret = drmMap(nvdev->fd, nvchan->drm.notifier, nvchan->drm.notifier_size,
+		     (drmAddressPtr)&nvchan->notifier_block);
+	if (ret) {
+		nouveau_channel_free((void *)&nvchan);
+		return ret;
+	}
+
+	ret = drmMap(nvdev->fd, nvchan->drm.cmdbuf, nvchan->drm.cmdbuf_size,
+		     (void*)&nvchan->pushbuf);
+	if (ret) {
+		nouveau_channel_free((void *)&nvchan);
+		return ret;
+	}
+
+	ret = nouveau_grobj_alloc(&nvchan->base, 0x00000000, 0x0030,
+				  &nvchan->base.nullobj);
+	if (ret) {
+		nouveau_channel_free((void *)&nvchan);
+		return ret;
+	}
+
+	nouveau_dma_channel_init(&nvchan->base);
+	nouveau_pushbuf_init(&nvchan->base);
+
+	*chan = &nvchan->base;
+	return 0;
+}
+
+void
+nouveau_channel_free(struct nouveau_channel **chan)
+{
+	struct nouveau_channel_priv *nvchan;
+	struct nouveau_device_priv *nvdev;
+	struct drm_nouveau_channel_free cf;
+
+	if (!chan || !*chan)
+		return;
+	nvchan = nouveau_channel(*chan);
+	*chan = NULL;
+	nvdev = nouveau_device(nvchan->base.device);
+	
+	FIRE_RING_CH(&nvchan->base);
+
+	nouveau_grobj_free(&nvchan->base.vram);
+	nouveau_grobj_free(&nvchan->base.gart);
+	nouveau_grobj_free(&nvchan->base.nullobj);
+
+	cf.channel = nvchan->drm.channel;
+	drmCommandWrite(nvdev->fd, DRM_NOUVEAU_CHANNEL_FREE, &cf, sizeof(cf));
+	free(nvchan);
+}
+
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_context.c b/src/gallium/winsys/drm/nouveau/nouveau_context.c
new file mode 100644
index 00000000000..74413c408f3
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_context.c
@@ -0,0 +1,346 @@
+#include "main/glheader.h"
+#include "glapi/glthread.h"
+#include <GL/internal/glcore.h>
+#include "utils.h"
+
+#include "state_tracker/st_public.h"
+#include "state_tracker/st_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+
+#include "nouveau_context.h"
+#include "nouveau_dri.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_winsys_pipe.h"
+
+#ifdef DEBUG
+static const struct dri_debug_control debug_control[] = {
+	{ "bo", DEBUG_BO },
+	{ NULL, 0 }
+};
+int __nouveau_debug = 0;
+#endif
+
+static void
+nouveau_channel_context_destroy(struct nouveau_channel_context *nvc)
+{
+	nouveau_grobj_free(&nvc->NvCtxSurf2D);
+	nouveau_grobj_free(&nvc->NvImageBlit);
+	nouveau_grobj_free(&nvc->NvGdiRect);
+	nouveau_grobj_free(&nvc->NvM2MF);
+	nouveau_grobj_free(&nvc->Nv2D);
+	nouveau_grobj_free(&nvc->NvSwzSurf);
+	nouveau_grobj_free(&nvc->NvSIFM);
+
+	nouveau_notifier_free(&nvc->sync_notifier);
+
+	nouveau_channel_free(&nvc->channel);
+
+	FREE(nvc);
+}
+
+static struct nouveau_channel_context *
+nouveau_channel_context_create(struct nouveau_device *dev)
+{
+	struct nouveau_channel_context *nvc;
+	int ret;
+
+	nvc = CALLOC_STRUCT(nouveau_channel_context);
+	if (!nvc)
+		return NULL;
+
+	if ((ret = nouveau_channel_alloc(dev, 0x8003d001, 0x8003d002,
+					 &nvc->channel))) {
+		NOUVEAU_ERR("Error creating GPU channel: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	nvc->next_handle = 0x80000000;
+
+	if ((ret = nouveau_notifier_alloc(nvc->channel, nvc->next_handle++, 1,
+					  &nvc->sync_notifier))) {
+		NOUVEAU_ERR("Error creating channel sync notifier: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	switch (dev->chipset & 0xf0) {
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		ret = nouveau_surface_channel_create_nv50(nvc);
+		break;
+	default:
+		ret = nouveau_surface_channel_create_nv04(nvc);
+		break;
+	}
+
+	if (ret) {
+		NOUVEAU_ERR("Error initialising surface objects: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	return nvc;
+}
+
+GLboolean
+nouveau_context_create(const __GLcontextModes *glVis,
+		       __DRIcontextPrivate *driContextPriv,
+		       void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *driScrnPriv = driContextPriv->driScreenPriv;
+	struct nouveau_screen  *nv_screen = driScrnPriv->private;
+	struct nouveau_context *nv = CALLOC_STRUCT(nouveau_context);
+	struct pipe_context *pipe = NULL;
+	struct st_context *st_share = NULL;
+	struct nouveau_channel_context *nvc = NULL;
+	struct nouveau_device *dev = nv_screen->device;
+	int i;
+
+	if (sharedContextPrivate) {
+		st_share = ((struct nouveau_context *)sharedContextPrivate)->st;
+	}
+
+	switch (dev->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		/* NV10 */
+	case 0x30:
+		/* NV30 */
+	case 0x40:
+	case 0x60:
+		/* NV40 */
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		/* G80 */
+		break;
+	default:
+		NOUVEAU_ERR("Unsupported chipset: NV%02x\n", dev->chipset);
+		return GL_FALSE;
+	}
+
+	driContextPriv->driverPrivate = (void *)nv;
+	nv->nv_screen  = nv_screen;
+	nv->dri_screen = driScrnPriv;
+
+	{
+		struct nouveau_device_priv *nvdev = nouveau_device(dev);
+
+		nvdev->ctx  = driContextPriv->hHWContext;
+		nvdev->lock = (drmLock *)&driScrnPriv->pSAREA->lock;
+	}
+
+	driParseConfigFiles(&nv->dri_option_cache, &nv_screen->option_cache,
+			    nv->dri_screen->myNum, "nouveau");
+#ifdef DEBUG
+	__nouveau_debug = driParseDebugString(getenv("NOUVEAU_DEBUG"),
+					      debug_control);
+#endif
+
+	/*XXX: Hack up a fake region and buffer object for front buffer.
+	 *     This will go away with TTM, replaced with a simple reference
+	 *     of the front buffer handle passed to us by the DDX.
+	 */
+	{
+		struct pipe_surface *fb_surf;
+		struct nouveau_pipe_buffer *fb_buf;
+		struct nouveau_bo_priv *fb_bo;
+
+		fb_bo = calloc(1, sizeof(struct nouveau_bo_priv));
+		fb_bo->drm.offset = nv_screen->front_offset;
+		fb_bo->drm.flags = NOUVEAU_MEM_FB;
+		fb_bo->drm.size = nv_screen->front_pitch * 
+				  nv_screen->front_height;
+		fb_bo->refcount = 1;
+		fb_bo->base.flags = NOUVEAU_BO_PIN | NOUVEAU_BO_VRAM;
+		fb_bo->base.offset = fb_bo->drm.offset;
+		fb_bo->base.handle = (unsigned long)fb_bo;
+		fb_bo->base.size = fb_bo->drm.size;
+		fb_bo->base.device = nv_screen->device;
+
+		fb_buf = calloc(1, sizeof(struct nouveau_pipe_buffer));
+		fb_buf->bo = &fb_bo->base;
+
+		fb_surf = calloc(1, sizeof(struct pipe_surface));
+		if (nv_screen->front_cpp == 2)
+			fb_surf->format = PIPE_FORMAT_R5G6B5_UNORM;
+		else
+			fb_surf->format = PIPE_FORMAT_A8R8G8B8_UNORM;
+		pf_get_block(fb_surf->format, &fb_surf->block);
+		fb_surf->width = nv_screen->front_pitch / nv_screen->front_cpp;
+		fb_surf->height = nv_screen->front_height;
+		fb_surf->stride = fb_surf->width * fb_surf->block.size;
+		fb_surf->refcount = 1;
+		fb_surf->buffer = &fb_buf->base;
+
+		nv->frontbuffer = fb_surf;
+	}
+
+	/* Attempt to share a single channel between multiple contexts from
+	 * a single process.
+	 */
+	nvc = nv_screen->nvc;
+	if (!nvc && st_share) {
+		struct nouveau_context *snv = st_share->pipe->priv;
+		if (snv) {
+			nvc = snv->nvc;
+		}
+	}
+
+	/*XXX: temporary - disable multi-context/single-channel on pre-NV4x */
+	switch (dev->chipset & 0xf0) {
+	case 0x40:
+	case 0x60:
+		/* NV40 class */
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		/* G80 class */
+		break;
+	default:
+		nvc = NULL;
+		break;
+	}
+
+	if (!nvc) {
+		nvc = nouveau_channel_context_create(dev);
+		if (!nvc) {
+			NOUVEAU_ERR("Failed initialising GPU context\n");
+			return GL_FALSE;
+		}
+		nv_screen->nvc = nvc;
+	}
+
+	nvc->refcount++;
+	nv->nvc = nvc;
+
+	/* Find a free slot for a pipe context, allocate a new one if needed */
+	nv->pctx_id = -1;
+	for (i = 0; i < nvc->nr_pctx; i++) {
+		if (nvc->pctx[i] == NULL) {
+			nv->pctx_id = i;
+			break;
+		}
+	}
+
+	if (nv->pctx_id < 0) {
+		nv->pctx_id = nvc->nr_pctx++;
+		nvc->pctx =
+			realloc(nvc->pctx,
+				sizeof(struct pipe_context *) * nvc->nr_pctx);
+	}
+
+	/* Create pipe */
+	switch (dev->chipset & 0xf0) {
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		if (nouveau_surface_init_nv50(nv))
+			return GL_FALSE;
+		break;
+	default:
+		if (nouveau_surface_init_nv04(nv))
+			return GL_FALSE;
+		break;
+	}
+
+	if (!getenv("NOUVEAU_FORCE_SOFTPIPE")) {
+		struct pipe_screen *pscreen;
+
+		pipe = nouveau_pipe_create(nv);
+		if (!pipe)
+			NOUVEAU_ERR("Couldn't create hw pipe\n");
+		pscreen = nvc->pscreen;
+
+		nv->cap.hw_vertex_buffer =
+			pscreen->get_param(pscreen, NOUVEAU_CAP_HW_VTXBUF);
+		nv->cap.hw_index_buffer =
+			pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF);
+	}
+
+	if (!pipe) {
+		NOUVEAU_MSG("Using softpipe\n");
+		pipe = nouveau_create_softpipe(nv);
+		if (!pipe) {
+			NOUVEAU_ERR("Error creating pipe, bailing\n");
+			return GL_FALSE;
+		}
+	}
+
+	pipe->priv = nv;
+	nv->st = st_create_context(pipe, glVis, st_share);
+	return GL_TRUE;
+}
+
+void
+nouveau_context_destroy(__DRIcontextPrivate *driContextPriv)
+{
+	struct nouveau_context *nv = driContextPriv->driverPrivate;
+	struct nouveau_channel_context *nvc = nv->nvc;
+
+	assert(nv);
+
+	st_finish(nv->st);
+	st_destroy_context(nv->st);
+
+	if (nv->pctx_id >= 0) {
+		nvc->pctx[nv->pctx_id] = NULL;
+		if (--nvc->refcount <= 0) {
+			nouveau_channel_context_destroy(nvc);
+			nv->nv_screen->nvc = NULL;
+		}
+	}
+
+	free(nv);
+}
+
+GLboolean
+nouveau_context_bind(__DRIcontextPrivate *driContextPriv,
+		     __DRIdrawablePrivate *driDrawPriv,
+		     __DRIdrawablePrivate *driReadPriv)
+{
+	struct nouveau_context *nv;
+	struct nouveau_framebuffer *draw, *read;
+
+	if (!driContextPriv) {
+		st_make_current(NULL, NULL, NULL);
+		return GL_TRUE;
+	}
+
+	nv = driContextPriv->driverPrivate;
+	draw = driDrawPriv->driverPrivate;
+	read = driReadPriv->driverPrivate;
+
+	st_make_current(nv->st, draw->stfb, read->stfb);
+
+	if ((nv->dri_drawable != driDrawPriv) ||
+	    (nv->last_stamp != driDrawPriv->lastStamp)) {
+		nv->dri_drawable = driDrawPriv;
+		st_resize_framebuffer(draw->stfb, driDrawPriv->w,
+				      driDrawPriv->h);
+		nv->last_stamp = driDrawPriv->lastStamp;
+	}
+
+	if (driDrawPriv != driReadPriv) {
+		st_resize_framebuffer(read->stfb, driReadPriv->w,
+				      driReadPriv->h);
+	}
+
+	return GL_TRUE;
+}
+
+GLboolean
+nouveau_context_unbind(__DRIcontextPrivate *driContextPriv)
+{
+	struct nouveau_context *nv = driContextPriv->driverPrivate;
+	(void)nv;
+
+	st_flush(nv->st, 0, NULL);
+	return GL_TRUE;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_context.h b/src/gallium/winsys/drm/nouveau/nouveau_context.h
new file mode 100644
index 00000000000..77e2147a2c7
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_context.h
@@ -0,0 +1,113 @@
+#ifndef __NOUVEAU_CONTEXT_H__
+#define __NOUVEAU_CONTEXT_H__
+
+#include "dri_util.h"
+#include "xmlconfig.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+
+struct nouveau_framebuffer {
+	struct st_framebuffer *stfb;
+};
+
+struct nouveau_channel_context {
+	struct pipe_screen *pscreen;
+	int refcount;
+
+	unsigned cur_pctx;
+	unsigned nr_pctx;
+	struct pipe_context **pctx;
+
+	struct nouveau_channel  *channel;
+
+	struct nouveau_notifier *sync_notifier;
+
+	/* Common */
+	struct nouveau_grobj    *NvM2MF;
+	/* NV04-NV40 */
+	struct nouveau_grobj    *NvCtxSurf2D;
+	struct nouveau_grobj	*NvSwzSurf;
+	struct nouveau_grobj    *NvImageBlit;
+	struct nouveau_grobj    *NvGdiRect;
+	struct nouveau_grobj	*NvSIFM;
+	/* G80 */
+	struct nouveau_grobj    *Nv2D;
+
+	uint32_t                 next_handle;
+	uint32_t                 next_subchannel;
+	uint32_t                 next_sequence;
+};
+
+struct nouveau_context {
+	struct st_context *st;
+
+	/* DRI stuff */
+	__DRIscreenPrivate    *dri_screen;
+	__DRIdrawablePrivate  *dri_drawable;
+	unsigned int           last_stamp;
+	driOptionCache         dri_option_cache;
+	drm_context_t          drm_context;
+	drmLock                drm_lock;
+	GLboolean              locked;
+	struct nouveau_screen *nv_screen;
+	struct pipe_surface *frontbuffer;
+
+	struct {
+		int hw_vertex_buffer;
+		int hw_index_buffer;
+	} cap;
+
+	/* Hardware context */
+	struct nouveau_channel_context *nvc;
+	int pctx_id;
+
+	/* pipe_surface accel */
+	struct pipe_surface *surf_src, *surf_dst;
+	unsigned surf_src_offset, surf_dst_offset;
+	int  (*surface_copy_prep)(struct nouveau_context *,
+				  struct pipe_surface *dst,
+				  struct pipe_surface *src);
+	void (*surface_copy)(struct nouveau_context *, unsigned dx, unsigned dy,
+			     unsigned sx, unsigned sy, unsigned w, unsigned h);
+	void (*surface_copy_done)(struct nouveau_context *);
+	int (*surface_fill)(struct nouveau_context *, struct pipe_surface *,
+			    unsigned, unsigned, unsigned, unsigned, unsigned);
+};
+
+extern GLboolean nouveau_context_create(const __GLcontextModes *,
+					__DRIcontextPrivate *, void *);
+extern void nouveau_context_destroy(__DRIcontextPrivate *);
+extern GLboolean nouveau_context_bind(__DRIcontextPrivate *,
+				      __DRIdrawablePrivate *draw,
+				      __DRIdrawablePrivate *read);
+extern GLboolean nouveau_context_unbind(__DRIcontextPrivate *);
+
+#ifdef DEBUG
+extern int __nouveau_debug;
+
+#define DEBUG_BO (1 << 0)
+
+#define DBG(flag, ...) do {                   \
+	if (__nouveau_debug & (DEBUG_##flag)) \
+		NOUVEAU_ERR(__VA_ARGS__);     \
+} while(0)
+#else
+#define DBG(flag, ...)
+#endif
+
+extern void LOCK_HARDWARE(struct nouveau_context *);
+extern void UNLOCK_HARDWARE(struct nouveau_context *);
+
+extern int
+nouveau_surface_channel_create_nv04(struct nouveau_channel_context *);
+extern int
+nouveau_surface_channel_create_nv50(struct nouveau_channel_context *);
+extern int nouveau_surface_init_nv04(struct nouveau_context *);
+extern int nouveau_surface_init_nv50(struct nouveau_context *);
+
+extern uint32_t *nouveau_pipe_dma_beginp(struct nouveau_grobj *, int, int);
+extern void nouveau_pipe_dma_kickoff(struct nouveau_channel *);
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_device.c b/src/gallium/winsys/drm/nouveau/nouveau_device.c
new file mode 100644
index 00000000000..0b452fcd02d
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_device.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+
+int
+nouveau_device_open_existing(struct nouveau_device **dev, int close,
+			     int fd, drm_context_t ctx)
+{
+	struct nouveau_device_priv *nvdev;
+	int ret;
+
+	if (!dev || *dev)
+	    return -EINVAL;
+
+	nvdev = calloc(1, sizeof(*nvdev));
+	if (!nvdev)
+	    return -ENOMEM;
+	nvdev->fd = fd;
+	nvdev->ctx = ctx;
+	nvdev->needs_close = close;
+
+	drmCommandNone(nvdev->fd, DRM_NOUVEAU_CARD_INIT);
+
+	if ((ret = nouveau_bo_init(&nvdev->base))) {
+		nouveau_device_close((void *)&nvdev);
+		return ret;
+	}
+
+	{
+		uint64_t value;
+
+		ret = nouveau_device_get_param(&nvdev->base,
+					       NOUVEAU_GETPARAM_CHIPSET_ID,
+					       &value);
+		if (ret) {
+			nouveau_device_close((void *)&nvdev);
+			return ret;
+		}
+		nvdev->base.chipset = value;
+	}
+
+	*dev = &nvdev->base;
+	return 0;
+}
+
+int
+nouveau_device_open(struct nouveau_device **dev, const char *busid)
+{
+	drm_context_t ctx;
+	int fd, ret;
+
+	if (!dev || *dev)
+		return -EINVAL;
+
+	fd = drmOpen("nouveau", busid);
+	if (fd < 0)
+		return -EINVAL;
+
+	ret = drmCreateContext(fd, &ctx);
+	if (ret) {
+		drmClose(fd);
+		return ret;
+	}
+
+	ret = nouveau_device_open_existing(dev, 1, fd, ctx);
+	if (ret) {
+	    drmDestroyContext(fd, ctx);
+	    drmClose(fd);
+	    return ret;
+	}
+
+	return 0;
+}
+
+void
+nouveau_device_close(struct nouveau_device **dev)
+{
+	struct nouveau_device_priv *nvdev;
+
+	if (dev || !*dev)
+		return;
+	nvdev = nouveau_device(*dev);
+	*dev = NULL;
+
+	nouveau_bo_takedown(&nvdev->base);
+
+	if (nvdev->needs_close) {
+		drmDestroyContext(nvdev->fd, nvdev->ctx);
+		drmClose(nvdev->fd);
+	}
+	free(nvdev);
+}
+
+int
+nouveau_device_get_param(struct nouveau_device *dev,
+			 uint64_t param, uint64_t *value)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	struct drm_nouveau_getparam g;
+	int ret;
+
+	if (!nvdev || !value)
+		return -EINVAL;
+
+	g.param = param;
+	ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GETPARAM,
+				  &g, sizeof(g));
+	if (ret)
+		return ret;
+
+	*value = g.value;
+	return 0;
+}
+
+int
+nouveau_device_set_param(struct nouveau_device *dev,
+			 uint64_t param, uint64_t value)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	struct drm_nouveau_setparam s;
+	int ret;
+
+	if (!nvdev)
+		return -EINVAL;
+
+	s.param = param;
+	s.value = value;
+	ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_SETPARAM,
+				  &s, sizeof(s));
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dma.c b/src/gallium/winsys/drm/nouveau/nouveau_dma.c
new file mode 100644
index 00000000000..f8a8ba04f6d
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_dma.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+#include "nouveau_local.h"
+
+static inline uint32_t
+READ_GET(struct nouveau_channel_priv *nvchan)
+{
+	return *nvchan->get;
+}
+
+static inline void
+WRITE_PUT(struct nouveau_channel_priv *nvchan, uint32_t val)
+{
+	uint32_t put = ((val << 2) + nvchan->dma->base);
+	volatile int dum;
+
+	NOUVEAU_DMA_BARRIER;
+	dum = READ_GET(nvchan);
+
+	*nvchan->put = put;
+	nvchan->dma->put = val;
+#ifdef NOUVEAU_DMA_TRACE
+	NOUVEAU_MSG("WRITE_PUT %d/0x%08x\n", nvchan->drm.channel, put);
+#endif
+
+	NOUVEAU_DMA_BARRIER;
+}
+
+static inline int
+LOCAL_GET(struct nouveau_dma_priv *dma, uint32_t *val)
+{
+	uint32_t get = *val;
+
+	if (get >= dma->base && get <= (dma->base + (dma->max << 2))) {
+		*val = (get - dma->base) >> 2;
+		return 1;
+	}
+
+	return 0;
+}
+
+void
+nouveau_dma_channel_init(struct nouveau_channel *chan)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	int i;
+
+	nvchan->dma = &nvchan->dma_master;
+	nvchan->dma->base = nvchan->drm.put_base;
+	nvchan->dma->cur  = nvchan->dma->put = 0;
+	nvchan->dma->max  = (nvchan->drm.cmdbuf_size >> 2) - 2;
+	nvchan->dma->free = nvchan->dma->max - nvchan->dma->cur;
+
+	RING_SPACE_CH(chan, RING_SKIPS);
+	for (i = 0; i < RING_SKIPS; i++)
+		OUT_RING_CH(chan, 0);
+}
+
+#define CHECK_TIMEOUT() do {                                                   \
+	if ((NOUVEAU_TIME_MSEC() - t_start) > NOUVEAU_DMA_TIMEOUT)             \
+		return - EBUSY;                                                \
+} while(0)
+
+int
+nouveau_dma_wait(struct nouveau_channel *chan, int size)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+	uint32_t get, t_start;
+
+	FIRE_RING_CH(chan);
+
+	t_start = NOUVEAU_TIME_MSEC();
+	while (dma->free < size) {
+		CHECK_TIMEOUT();
+
+		get = READ_GET(nvchan);
+		if (!LOCAL_GET(dma, &get))
+			continue;
+
+		if (dma->put >= get) {
+			dma->free = dma->max - dma->cur;
+
+			if (dma->free < size) {
+#ifdef NOUVEAU_DMA_DEBUG
+				dma->push_free = 1;
+#endif
+				OUT_RING_CH(chan, 0x20000000 | dma->base);
+				if (get <= RING_SKIPS) {
+					/*corner case - will be idle*/
+					if (dma->put <= RING_SKIPS)
+						WRITE_PUT(nvchan,
+							  RING_SKIPS + 1);
+
+					do {
+						CHECK_TIMEOUT();
+						get = READ_GET(nvchan);
+						if (!LOCAL_GET(dma, &get))
+							get = 0;
+					} while (get <= RING_SKIPS);
+				}
+
+				WRITE_PUT(nvchan, RING_SKIPS);
+				dma->cur  = dma->put = RING_SKIPS;
+				dma->free = get - (RING_SKIPS + 1);
+			}
+		} else {
+			dma->free = get - dma->cur - 1;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF
+static void
+nouveau_dma_parse_pushbuf(struct nouveau_channel *chan, int get, int put)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	unsigned mthd_count = 0;
+	
+	while (get != put) {
+		uint32_t gpuget = (get << 2) + nvchan->drm.put_base;
+		uint32_t data;
+
+		if (get < 0 || get >= nvchan->drm.cmdbuf_size) {
+			NOUVEAU_ERR("DMA_PT 0x%08x\n", gpuget);
+			assert(0);
+		}
+		data = nvchan->pushbuf[get++];
+
+		if (mthd_count) {
+			NOUVEAU_MSG("0x%08x 0x%08x\n", gpuget, data);
+			mthd_count--;
+			continue;
+		}
+
+		switch (data & 0x60000000) {
+		case 0x00000000:
+			mthd_count = (data >> 18) & 0x7ff;
+			NOUVEAU_MSG("0x%08x 0x%08x MTHD "
+				    "Sc %d Mthd 0x%04x Size %d\n",
+				    gpuget, data, (data>>13) & 7, data & 0x1ffc,
+				    mthd_count);
+			break;
+		case 0x20000000:
+			get = (data & 0x1ffffffc) >> 2;
+			NOUVEAU_MSG("0x%08x 0x%08x JUMP 0x%08x\n",
+				    gpuget, data, data & 0x1ffffffc);
+			continue;
+		case 0x40000000:
+			mthd_count = (data >> 18) & 0x7ff;
+			NOUVEAU_MSG("0x%08x 0x%08x NINC "
+				    "Sc %d Mthd 0x%04x Size %d\n",
+				    gpuget, data, (data>>13) & 7, data & 0x1ffc,
+				    mthd_count);
+			break;
+		case 0x60000000:
+			/* DMA_OPCODE_CALL apparently, doesn't seem to work on
+			 * my NV40 at least..
+			 */
+			/* fall-through */
+		default:
+			NOUVEAU_MSG("DMA_PUSHER 0x%08x 0x%08x\n",
+				    gpuget, data);
+			assert(0);
+		}
+	}
+}
+#endif
+
+void
+nouveau_dma_kickoff(struct nouveau_channel *chan)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+
+	if (dma->cur == dma->put)
+		return;
+
+#ifdef NOUVEAU_DMA_DEBUG
+	if (dma->push_free) {
+		NOUVEAU_ERR("Packet incomplete: %d left\n", dma->push_free);
+		return;
+	}
+#endif
+
+#ifdef NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF
+	nouveau_dma_parse_pushbuf(chan, dma->put, dma->cur);
+#endif
+
+	WRITE_PUT(nvchan, dma->cur);
+}
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dma.h b/src/gallium/winsys/drm/nouveau/nouveau_dma.h
new file mode 100644
index 00000000000..cfa6d26e828
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_dma.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_DMA_H__
+#define __NOUVEAU_DMA_H__
+
+#include <string.h>
+#include "nouveau_drmif.h"
+#include "nouveau_local.h"
+
+#define RING_SKIPS 8
+
+extern int  nouveau_dma_wait(struct nouveau_channel *chan, int size);
+extern void nouveau_dma_subc_bind(struct nouveau_grobj *);
+extern void nouveau_dma_channel_init(struct nouveau_channel *);
+extern void nouveau_dma_kickoff(struct nouveau_channel *);
+
+#ifdef NOUVEAU_DMA_DEBUG
+static char faulty[1024];
+#endif
+
+static inline void
+nouveau_dma_out(struct nouveau_channel *chan, uint32_t data)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+
+#ifdef NOUVEAU_DMA_DEBUG
+	if (dma->push_free == 0) {
+		NOUVEAU_ERR("No space left in packet at %s\n", faulty);
+		return;
+	}
+	dma->push_free--;
+#endif
+#ifdef NOUVEAU_DMA_TRACE
+	{
+		uint32_t offset = (dma->cur << 2) + dma->base;
+		NOUVEAU_MSG("\tOUT_RING %d/0x%08x -> 0x%08x\n",
+			    nvchan->drm.channel, offset, data);
+	}
+#endif
+	nvchan->pushbuf[dma->cur + (dma->base - nvchan->drm.put_base)/4] = data;
+	dma->cur++;
+}
+
+static inline void
+nouveau_dma_outp(struct nouveau_channel *chan, uint32_t *ptr, int size)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+	(void)dma;
+
+#ifdef NOUVEAU_DMA_DEBUG
+	if (dma->push_free < size) {
+		NOUVEAU_ERR("Packet too small.  Free=%d, Need=%d\n",
+			    dma->push_free, size);
+		return;
+	}
+#endif
+#ifdef NOUVEAU_DMA_TRACE
+	while (size--) {
+		nouveau_dma_out(chan, *ptr);
+		ptr++;
+	}
+#else
+	memcpy(&nvchan->pushbuf[dma->cur], ptr, size << 2);
+#ifdef NOUVEAU_DMA_DEBUG
+	dma->push_free -= size;
+#endif
+	dma->cur += size;
+#endif
+}
+
+static inline void
+nouveau_dma_space(struct nouveau_channel *chan, int size)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+
+	if (dma->free < size) {
+		if (nouveau_dma_wait(chan, size) && chan->hang_notify)
+			chan->hang_notify(chan);
+	}
+	dma->free -= size;
+#ifdef NOUVEAU_DMA_DEBUG
+	dma->push_free = size;
+#endif
+}
+
+static inline void
+nouveau_dma_begin(struct nouveau_channel *chan, struct nouveau_grobj *grobj,
+		  int method, int size, const char* file, int line)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *dma = nvchan->dma;
+	(void)dma;
+
+#ifdef NOUVEAU_DMA_TRACE
+	NOUVEAU_MSG("BEGIN_RING %d/%08x/%d/0x%04x/%d\n", nvchan->drm.channel,
+		    grobj->handle, grobj->subc, method, size);
+#endif
+
+#ifdef NOUVEAU_DMA_DEBUG
+	if (dma->push_free) {
+		NOUVEAU_ERR("Previous packet incomplete: %d left at %s\n",
+			    dma->push_free, faulty);
+		return;
+	}
+	sprintf(faulty,"%s:%d",file,line);
+#endif
+
+	nouveau_dma_space(chan, (size + 1));
+	nouveau_dma_out(chan, (size << 18) | (grobj->subc << 13) | method);
+}
+
+#define RING_SPACE_CH(ch,sz)         nouveau_dma_space((ch), (sz))
+#define BEGIN_RING_CH(ch,gr,m,sz)    nouveau_dma_begin((ch), (gr), (m), (sz), __FUNCTION__, __LINE__ )
+#define OUT_RING_CH(ch, data)        nouveau_dma_out((ch), (data))
+#define OUT_RINGp_CH(ch,ptr,dwords)  nouveau_dma_outp((ch), (void*)(ptr),      \
+						      (dwords))
+#define FIRE_RING_CH(ch)             nouveau_dma_kickoff((ch))
+#define WAIT_RING_CH(ch,sz)          nouveau_dma_wait((ch), (sz))
+		
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dri.h b/src/gallium/winsys/drm/nouveau/nouveau_dri.h
new file mode 100644
index 00000000000..1207c2d609c
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_dri.h
@@ -0,0 +1,28 @@
+#ifndef _NOUVEAU_DRI_
+#define _NOUVEAU_DRI_
+
+#include "xf86drm.h"
+#include "drm.h"
+#include "nouveau_drm.h"
+
+struct nouveau_dri {
+	uint32_t device_id;	/**< \brief PCI device ID */
+	uint32_t width;		/**< \brief width in pixels of display */
+	uint32_t height;	/**< \brief height in scanlines of display */
+	uint32_t depth;		/**< \brief depth of display (8, 15, 16, 24) */
+	uint32_t bpp;		/**< \brief bit depth of display (8, 16, 24, 32) */
+
+	uint32_t bus_type;	/**< \brief ths bus type */
+	uint32_t bus_mode;	/**< \brief bus mode (used for AGP, maybe also for PCI-E ?) */
+
+	uint32_t front_offset;	/**< \brief front buffer offset */
+	uint32_t front_pitch;	/**< \brief front buffer pitch */
+	uint32_t back_offset;	/**< \brief private back buffer offset */
+	uint32_t back_pitch;	/**< \brief private back buffer pitch */
+	uint32_t depth_offset;	/**< \brief private depth buffer offset */
+	uint32_t depth_pitch;	/**< \brief private depth buffer pitch */
+
+};
+
+#endif
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_drmif.h b/src/gallium/winsys/drm/nouveau/nouveau_drmif.h
new file mode 100644
index 00000000000..dcd6a5eb0a4
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_drmif.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NOUVEAU_DRMIF_H__
+#define __NOUVEAU_DRMIF_H__
+
+#include <stdint.h>
+#include <xf86drm.h>
+#include <nouveau_drm.h>
+
+#include "nouveau/nouveau_device.h"
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_grobj.h"
+#include "nouveau/nouveau_notifier.h"
+#include "nouveau/nouveau_bo.h"
+#include "nouveau/nouveau_resource.h"
+#include "nouveau/nouveau_pushbuf.h"
+
+struct nouveau_device_priv {
+	struct nouveau_device base;
+
+	int fd;
+	drm_context_t ctx;
+	drmLock *lock;
+	int needs_close;
+
+	struct drm_nouveau_mem_alloc sa;
+	void *sa_map;
+	struct nouveau_resource *sa_heap;
+};
+#define nouveau_device(n) ((struct nouveau_device_priv *)(n))
+
+extern int
+nouveau_device_open_existing(struct nouveau_device **, int close,
+			     int fd, drm_context_t ctx);
+
+extern int
+nouveau_device_open(struct nouveau_device **, const char *busid);
+
+extern void
+nouveau_device_close(struct nouveau_device **);
+
+extern int
+nouveau_device_get_param(struct nouveau_device *, uint64_t param, uint64_t *v);
+
+extern int
+nouveau_device_set_param(struct nouveau_device *, uint64_t param, uint64_t val);
+
+struct nouveau_fence {
+	struct nouveau_channel *channel;
+};
+
+struct nouveau_fence_cb {
+	struct nouveau_fence_cb *next;
+	void (*func)(void *);
+	void *priv;
+};
+
+struct nouveau_fence_priv {
+	struct nouveau_fence base;
+	int refcount;
+
+	struct nouveau_fence *next;
+	struct nouveau_fence_cb *signal_cb;
+
+	uint32_t sequence;
+	int emitted;
+	int signalled;
+};
+#define nouveau_fence(n) ((struct nouveau_fence_priv *)(n))
+
+extern int
+nouveau_fence_new(struct nouveau_channel *, struct nouveau_fence **);
+
+extern int
+nouveau_fence_ref(struct nouveau_fence *, struct nouveau_fence **);
+
+extern int
+nouveau_fence_signal_cb(struct nouveau_fence *, void (*)(void *), void *);
+
+extern void
+nouveau_fence_emit(struct nouveau_fence *);
+
+extern int
+nouveau_fence_wait(struct nouveau_fence **);
+
+extern void
+nouveau_fence_flush(struct nouveau_channel *);
+
+struct nouveau_pushbuf_reloc {
+	struct nouveau_pushbuf_bo *pbbo;
+	uint32_t *ptr;
+	uint32_t flags;
+	uint32_t data;
+	uint32_t vor;
+	uint32_t tor;
+};
+
+struct nouveau_pushbuf_bo {
+	struct nouveau_channel *channel;
+	struct nouveau_bo *bo;
+	unsigned flags;
+	unsigned handled;
+};
+
+#define NOUVEAU_PUSHBUF_MAX_BUFFERS 1024
+#define NOUVEAU_PUSHBUF_MAX_RELOCS 1024
+struct nouveau_pushbuf_priv {
+	struct nouveau_pushbuf base;
+
+	struct nouveau_fence *fence;
+
+	unsigned nop_jump;
+	unsigned start;
+	unsigned size;
+
+	struct nouveau_pushbuf_bo *buffers;
+	unsigned nr_buffers;
+	struct nouveau_pushbuf_reloc *relocs;
+	unsigned nr_relocs;
+};
+#define nouveau_pushbuf(n) ((struct nouveau_pushbuf_priv *)(n))
+
+#define pbbo_to_ptr(o) ((uint64_t)(unsigned long)(o))
+#define ptr_to_pbbo(h) ((struct nouveau_pushbuf_bo *)(unsigned long)(h))
+#define pbrel_to_ptr(o) ((uint64_t)(unsigned long)(o))
+#define ptr_to_pbrel(h) ((struct nouveau_pushbuf_reloc *)(unsigned long)(h))
+#define bo_to_ptr(o) ((uint64_t)(unsigned long)(o))
+#define ptr_to_bo(h) ((struct nouveau_bo_priv *)(unsigned long)(h))
+
+extern int
+nouveau_pushbuf_init(struct nouveau_channel *);
+
+extern int
+nouveau_pushbuf_flush(struct nouveau_channel *, unsigned min);
+
+extern int
+nouveau_pushbuf_emit_reloc(struct nouveau_channel *, void *ptr,
+			   struct nouveau_bo *, uint32_t data, uint32_t flags,
+			   uint32_t vor, uint32_t tor);
+
+struct nouveau_dma_priv {
+	uint32_t base;
+	uint32_t max;
+	uint32_t cur;
+	uint32_t put;
+	uint32_t free;
+
+	int push_free;
+} dma;
+
+struct nouveau_channel_priv {
+	struct nouveau_channel base;
+
+	struct drm_nouveau_channel_alloc drm;
+
+	uint32_t *pushbuf;
+	void     *notifier_block;
+
+	volatile uint32_t *user;
+	volatile uint32_t *put;
+	volatile uint32_t *get;
+	volatile uint32_t *ref_cnt;
+
+	struct nouveau_dma_priv dma_master;
+	struct nouveau_dma_priv dma_bufmgr;
+	struct nouveau_dma_priv *dma;
+
+	struct nouveau_fence *fence_head;
+	struct nouveau_fence *fence_tail;
+	uint32_t fence_sequence;
+
+	struct nouveau_pushbuf_priv pb;
+
+	unsigned user_charge;
+};
+#define nouveau_channel(n) ((struct nouveau_channel_priv *)(n))
+
+extern int
+nouveau_channel_alloc(struct nouveau_device *, uint32_t fb, uint32_t tt,
+		      struct nouveau_channel **);
+
+extern void
+nouveau_channel_free(struct nouveau_channel **);
+
+struct nouveau_grobj_priv {
+	struct nouveau_grobj base;
+};
+#define nouveau_grobj(n) ((struct nouveau_grobj_priv *)(n))
+
+extern int nouveau_grobj_alloc(struct nouveau_channel *, uint32_t handle,
+			       int class, struct nouveau_grobj **);
+extern int nouveau_grobj_ref(struct nouveau_channel *, uint32_t handle,
+			     struct nouveau_grobj **);
+extern void nouveau_grobj_free(struct nouveau_grobj **);
+
+
+struct nouveau_notifier_priv {
+	struct nouveau_notifier base;
+
+	struct drm_nouveau_notifierobj_alloc drm;
+	volatile void *map;
+};
+#define nouveau_notifier(n) ((struct nouveau_notifier_priv *)(n))
+
+extern int
+nouveau_notifier_alloc(struct nouveau_channel *, uint32_t handle, int count,
+		       struct nouveau_notifier **);
+
+extern void
+nouveau_notifier_free(struct nouveau_notifier **);
+
+extern void
+nouveau_notifier_reset(struct nouveau_notifier *, int id);
+
+extern uint32_t
+nouveau_notifier_status(struct nouveau_notifier *, int id);
+
+extern uint32_t
+nouveau_notifier_return_val(struct nouveau_notifier *, int id);
+
+extern int
+nouveau_notifier_wait_status(struct nouveau_notifier *, int id, int status,
+			     int timeout);
+
+struct nouveau_bo_priv {
+	struct nouveau_bo base;
+
+	struct nouveau_pushbuf_bo *pending;
+	struct nouveau_fence *fence;
+	struct nouveau_fence *wr_fence;
+
+	struct drm_nouveau_mem_alloc drm;
+	void *map;
+
+	void *sysmem;
+	int user;
+
+	int refcount;
+
+	uint64_t offset;
+	uint64_t flags;
+	int tiled;
+};
+#define nouveau_bo(n) ((struct nouveau_bo_priv *)(n))
+
+extern int
+nouveau_bo_init(struct nouveau_device *);
+
+extern void
+nouveau_bo_takedown(struct nouveau_device *);
+
+extern int
+nouveau_bo_new(struct nouveau_device *, uint32_t flags, int align, int size,
+	       struct nouveau_bo **);
+
+extern int
+nouveau_bo_user(struct nouveau_device *, void *ptr, int size,
+		struct nouveau_bo **);
+
+extern int
+nouveau_bo_ref(struct nouveau_device *, uint64_t handle, struct nouveau_bo **);
+
+extern int
+nouveau_bo_set_status(struct nouveau_bo *, uint32_t flags);
+
+extern void
+nouveau_bo_del(struct nouveau_bo **);
+
+extern int
+nouveau_bo_map(struct nouveau_bo *, uint32_t flags);
+
+extern void
+nouveau_bo_unmap(struct nouveau_bo *);
+
+extern int
+nouveau_bo_validate(struct nouveau_channel *, struct nouveau_bo *,
+		    uint32_t flags);
+
+extern int
+nouveau_resource_init(struct nouveau_resource **heap, unsigned start,
+		      unsigned size);
+
+extern int
+nouveau_resource_alloc(struct nouveau_resource *heap, int size, void *priv,
+		       struct nouveau_resource **);
+
+extern void
+nouveau_resource_free(struct nouveau_resource **);
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_fence.c b/src/gallium/winsys/drm/nouveau/nouveau_fence.c
new file mode 100644
index 00000000000..e7b0b4ff079
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_fence.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+#include "nouveau_local.h"
+
+static void
+nouveau_fence_del_unsignalled(struct nouveau_fence *fence)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(fence->channel);
+	struct nouveau_fence *le;
+
+	if (nvchan->fence_head == fence) {
+		nvchan->fence_head = nouveau_fence(fence)->next;
+		if (nvchan->fence_head == NULL)
+			nvchan->fence_tail = NULL;
+		return;
+	}
+
+	le = nvchan->fence_head;
+	while (le && nouveau_fence(le)->next != fence)
+		le = nouveau_fence(le)->next;
+	assert(le && nouveau_fence(le)->next == fence);
+	nouveau_fence(le)->next = nouveau_fence(fence)->next;
+	if (nvchan->fence_tail == fence)
+		nvchan->fence_tail = le;
+}
+
+static void
+nouveau_fence_del(struct nouveau_fence **fence)
+{
+	struct nouveau_fence_priv *nvfence;
+
+	if (!fence || !*fence)
+		return;
+	nvfence = nouveau_fence(*fence);
+	*fence = NULL;
+
+	if (--nvfence->refcount)
+		return;
+
+	if (nvfence->emitted && !nvfence->signalled) {
+		if (nvfence->signal_cb) {
+			nvfence->refcount++;
+			nouveau_fence_wait((void *)&nvfence);
+			return;
+		}
+
+		nouveau_fence_del_unsignalled(&nvfence->base);
+	}
+	free(nvfence);
+}
+
+int
+nouveau_fence_new(struct nouveau_channel *chan, struct nouveau_fence **fence)
+{
+	struct nouveau_fence_priv *nvfence;
+
+	if (!chan || !fence || *fence)
+		return -EINVAL;
+	
+	nvfence = calloc(1, sizeof(struct nouveau_fence_priv));
+	if (!nvfence)
+		return -ENOMEM;
+	nvfence->base.channel = chan;
+	nvfence->refcount = 1;
+
+	*fence = &nvfence->base;
+	return 0;
+}
+
+int
+nouveau_fence_ref(struct nouveau_fence *ref, struct nouveau_fence **fence)
+{
+	struct nouveau_fence_priv *nvfence;
+
+	if (!fence)
+		return -EINVAL;
+
+	if (*fence) {
+		nouveau_fence_del(fence);
+		*fence = NULL;
+	}
+
+	if (ref) {
+		nvfence = nouveau_fence(ref);
+		nvfence->refcount++;	
+		*fence = &nvfence->base;
+	}
+
+	return 0;
+}
+
+int
+nouveau_fence_signal_cb(struct nouveau_fence *fence, void (*func)(void *),
+			void *priv)
+{
+	struct nouveau_fence_priv *nvfence = nouveau_fence(fence);
+	struct nouveau_fence_cb *cb;
+
+	if (!nvfence || !func)
+		return -EINVAL;
+
+	cb = malloc(sizeof(struct nouveau_fence_cb));
+	if (!cb)
+		return -ENOMEM;
+
+	cb->func = func;
+	cb->priv = priv;
+	cb->next = nvfence->signal_cb;
+	nvfence->signal_cb = cb;
+	return 0;
+}
+
+void
+nouveau_fence_emit(struct nouveau_fence *fence)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(fence->channel);
+	struct nouveau_fence_priv *nvfence = nouveau_fence(fence);
+
+	nvfence->emitted = 1;
+	nvfence->sequence = ++nvchan->fence_sequence;
+	if (nvfence->sequence == 0xffffffff)
+		NOUVEAU_ERR("AII wrap unhandled\n");
+
+	/*XXX: assumes subc 0 is populated */
+	RING_SPACE_CH(fence->channel, 2);
+	OUT_RING_CH  (fence->channel, 0x00040050);
+	OUT_RING_CH  (fence->channel, nvfence->sequence);
+
+	if (nvchan->fence_tail) {
+		nouveau_fence(nvchan->fence_tail)->next = fence;
+	} else {
+		nvchan->fence_head = fence;
+	}
+	nvchan->fence_tail = fence;
+}
+
+void
+nouveau_fence_flush(struct nouveau_channel *chan)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	uint32_t sequence = *nvchan->ref_cnt;
+
+	while (nvchan->fence_head) {
+		struct nouveau_fence_priv *nvfence;
+	
+		nvfence = nouveau_fence(nvchan->fence_head);
+		if (nvfence->sequence > sequence)
+			break;
+		nouveau_fence_del_unsignalled(&nvfence->base);
+		nvfence->signalled = 1;
+
+		if (nvfence->signal_cb) {
+			struct nouveau_fence *fence = NULL;
+
+			nouveau_fence_ref(&nvfence->base, &fence);
+
+			while (nvfence->signal_cb) {
+				struct nouveau_fence_cb *cb;
+				
+				cb = nvfence->signal_cb;
+				nvfence->signal_cb = cb->next;
+				cb->func(cb->priv);
+				free(cb);
+			}
+
+			nouveau_fence_ref(NULL, &fence);
+		}
+	}
+}
+
+int
+nouveau_fence_wait(struct nouveau_fence **fence)
+{
+	struct nouveau_fence_priv *nvfence;
+	
+	if (!fence || !*fence)
+		return -EINVAL;
+	nvfence = nouveau_fence(*fence);
+
+	if (nvfence->emitted) {
+		while (!nvfence->signalled)
+			nouveau_fence_flush(nvfence->base.channel);
+	}
+	nouveau_fence_ref(NULL, fence);
+
+	return 0;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_grobj.c b/src/gallium/winsys/drm/nouveau/nouveau_grobj.c
new file mode 100644
index 00000000000..51523897d58
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_grobj.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+
+int
+nouveau_grobj_alloc(struct nouveau_channel *chan, uint32_t handle,
+		    int class, struct nouveau_grobj **grobj)
+{
+	struct nouveau_device_priv *nvdev = nouveau_device(chan->device);
+	struct nouveau_grobj_priv *nvgrobj;
+	struct drm_nouveau_grobj_alloc g;
+	int ret;
+
+	if (!nvdev || !grobj || *grobj)
+		return -EINVAL;
+
+	nvgrobj = calloc(1, sizeof(*nvgrobj));
+	if (!nvgrobj)
+		return -ENOMEM;
+	nvgrobj->base.channel = chan;
+	nvgrobj->base.handle  = handle;
+	nvgrobj->base.grclass = class;
+
+	g.channel = chan->id;
+	g.handle  = handle;
+	g.class   = class;
+	ret = drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GROBJ_ALLOC,
+			      &g, sizeof(g));
+	if (ret) {
+		nouveau_grobj_free((void *)&nvgrobj);
+		return ret;
+	}
+
+	*grobj = &nvgrobj->base;
+	return 0;
+}
+
+int
+nouveau_grobj_ref(struct nouveau_channel *chan, uint32_t handle,
+		  struct nouveau_grobj **grobj)
+{
+	struct nouveau_grobj_priv *nvgrobj;
+
+	if (!chan || !grobj || *grobj)
+		return -EINVAL;
+
+	nvgrobj = calloc(1, sizeof(struct nouveau_grobj_priv));
+	if (!nvgrobj)
+		return -ENOMEM;
+	nvgrobj->base.channel = chan;
+	nvgrobj->base.handle = handle;
+	nvgrobj->base.grclass = 0;
+
+	*grobj = &nvgrobj->base;
+	return 0;
+}
+
+void
+nouveau_grobj_free(struct nouveau_grobj **grobj)
+{
+	struct nouveau_device_priv *nvdev;
+	struct nouveau_channel_priv *chan;
+	struct nouveau_grobj_priv *nvgrobj;
+
+	if (!grobj || !*grobj)
+		return;
+	nvgrobj = nouveau_grobj(*grobj);
+	*grobj = NULL;
+
+
+	chan = nouveau_channel(nvgrobj->base.channel);
+	nvdev = nouveau_device(chan->base.device);
+
+	if (nvgrobj->base.grclass) {
+		struct drm_nouveau_gpuobj_free f;
+
+		f.channel = chan->drm.channel;
+		f.handle  = nvgrobj->base.handle;
+		drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GPUOBJ_FREE,
+				&f, sizeof(f));	
+	}
+	free(nvgrobj);
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_local.h b/src/gallium/winsys/drm/nouveau/nouveau_local.h
new file mode 100644
index 00000000000..e878a408037
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_local.h
@@ -0,0 +1,117 @@
+#ifndef __NOUVEAU_LOCAL_H__
+#define __NOUVEAU_LOCAL_H__
+
+#include "pipe/p_compiler.h"
+#include "nouveau_winsys_pipe.h"
+#include <stdio.h>
+
+struct pipe_buffer;
+
+/* Debug output */
+#define NOUVEAU_MSG(fmt, args...) do {                                         \
+	fprintf(stdout, "nouveau: "fmt, ##args);                               \
+	fflush(stdout);                                                        \
+} while(0)
+
+#define NOUVEAU_ERR(fmt, args...) do {                                         \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);           \
+	fflush(stderr);                                                        \
+} while(0)
+
+#define NOUVEAU_TIME_MSEC() 0
+
+/* User FIFO control */
+//#define NOUVEAU_DMA_TRACE
+//#define NOUVEAU_DMA_DEBUG
+//#define NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF
+#define NOUVEAU_DMA_BARRIER 
+#define NOUVEAU_DMA_TIMEOUT 2000
+
+/* Push buffer access macros */
+static INLINE void
+OUT_RING(struct nouveau_channel *chan, unsigned data)
+{
+	*(chan->pushbuf->cur++) = (data);
+}
+
+static INLINE void
+OUT_RINGp(struct nouveau_channel *chan, uint32_t *data, unsigned size)
+{
+	memcpy(chan->pushbuf->cur, data, size * 4);
+	chan->pushbuf->cur += size;
+}
+
+static INLINE void
+OUT_RINGf(struct nouveau_channel *chan, float f)
+{
+	union { uint32_t i; float f; } c;
+	c.f = f;
+	OUT_RING(chan, c.i);
+}
+
+static INLINE void
+BEGIN_RING(struct nouveau_channel *chan, struct nouveau_grobj *gr,
+	   unsigned mthd, unsigned size)
+{
+	if (chan->pushbuf->remaining < (size + 1))
+		nouveau_pushbuf_flush(chan, (size + 1));
+	OUT_RING(chan, (gr->subc << 13) | (size << 18) | mthd);
+	chan->pushbuf->remaining -= (size + 1);
+}
+
+static INLINE void
+FIRE_RING(struct nouveau_channel *chan)
+{
+	nouveau_pushbuf_flush(chan, 0);
+}
+
+static INLINE void
+BIND_RING(struct nouveau_channel *chan, struct nouveau_grobj *gr, unsigned subc)
+{
+	gr->subc = subc;
+	BEGIN_RING(chan, gr, 0x0000, 1);
+	OUT_RING  (chan, gr->handle);
+}
+
+static INLINE void
+OUT_RELOC(struct nouveau_channel *chan, struct nouveau_bo *bo,
+	  unsigned data, unsigned flags, unsigned vor, unsigned tor)
+{
+	nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, bo,
+				   data, flags, vor, tor);
+}
+
+/* Raw data + flags depending on FB/TT buffer */
+static INLINE void
+OUT_RELOCd(struct nouveau_channel *chan, struct nouveau_bo *bo,
+	   unsigned data, unsigned flags, unsigned vor, unsigned tor)
+{
+	OUT_RELOC(chan, bo, data, flags | NOUVEAU_BO_OR, vor, tor);
+}
+
+/* FB/TT object handle */
+static INLINE void
+OUT_RELOCo(struct nouveau_channel *chan, struct nouveau_bo *bo,
+	   unsigned flags)
+{
+	OUT_RELOC(chan, bo, 0, flags | NOUVEAU_BO_OR,
+		  chan->vram->handle, chan->gart->handle);
+}
+
+/* Low 32-bits of offset */
+static INLINE void
+OUT_RELOCl(struct nouveau_channel *chan, struct nouveau_bo *bo,
+	   unsigned delta, unsigned flags)
+{
+	OUT_RELOC(chan, bo, delta, flags | NOUVEAU_BO_LOW, 0, 0);
+}
+
+/* High 32-bits of offset */
+static INLINE void
+OUT_RELOCh(struct nouveau_channel *chan, struct nouveau_bo *bo,
+	   unsigned delta, unsigned flags)
+{
+	OUT_RELOC(chan, bo, delta, flags | NOUVEAU_BO_HIGH, 0, 0);
+}
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_lock.c b/src/gallium/winsys/drm/nouveau/nouveau_lock.c
new file mode 100644
index 00000000000..9adb9ac8547
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_lock.c
@@ -0,0 +1,94 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "main/glheader.h"
+#include "glapi/glthread.h"
+#include <GL/internal/glcore.h>
+
+#include "nouveau_context.h"
+#include "nouveau_screen.h"
+
+_glthread_DECLARE_STATIC_MUTEX( lockMutex );
+
+static void
+nouveau_contended_lock(struct nouveau_context *nv, GLuint flags)
+{
+	__DRIdrawablePrivate *dPriv = nv->dri_drawable;
+	__DRIscreenPrivate *sPriv = nv->dri_screen;
+	struct nouveau_screen *nv_screen = nv->nv_screen;
+	struct nouveau_device *dev = nv_screen->device;
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+
+	drmGetLock(nvdev->fd, nvdev->ctx, flags);
+
+	/* If the window moved, may need to set a new cliprect now.
+	 *
+	 * NOTE: This releases and regains the hw lock, so all state
+	 * checking must be done *after* this call:
+	 */
+	if (dPriv)
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, dPriv);
+}
+
+/* Lock the hardware and validate our state.
+ */
+void
+LOCK_HARDWARE(struct nouveau_context *nv)
+{
+	struct nouveau_screen *nv_screen = nv->nv_screen;
+	struct nouveau_device *dev = nv_screen->device;
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+	char __ret=0;
+
+	_glthread_LOCK_MUTEX(lockMutex);
+	assert(!nv->locked);
+	
+	DRM_CAS(nvdev->lock, nvdev->ctx,
+		(DRM_LOCK_HELD | nvdev->ctx), __ret);
+	
+	if (__ret)
+		nouveau_contended_lock(nv, 0);
+	nv->locked = GL_TRUE;
+}
+
+
+  /* Unlock the hardware using the global current context 
+   */
+void
+UNLOCK_HARDWARE(struct nouveau_context *nv)
+{
+	struct nouveau_screen *nv_screen = nv->nv_screen;
+	struct nouveau_device *dev = nv_screen->device;
+	struct nouveau_device_priv *nvdev = nouveau_device(dev);
+
+	assert(nv->locked);
+	nv->locked = GL_FALSE;
+
+	DRM_UNLOCK(nvdev->fd, nvdev->lock, nvdev->ctx);
+
+	_glthread_UNLOCK_MUTEX(lockMutex);
+} 
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_notifier.c b/src/gallium/winsys/drm/nouveau/nouveau_notifier.c
new file mode 100644
index 00000000000..01e8f38440e
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_notifier.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_local.h"
+
+#define NOTIFIER(__v)                                                          \
+	struct nouveau_notifier_priv *nvnotify = nouveau_notifier(notifier);   \
+	volatile uint32_t *__v = (void*)nvnotify->map + (id * 32)
+
+int
+nouveau_notifier_alloc(struct nouveau_channel *chan, uint32_t handle,
+		       int count, struct nouveau_notifier **notifier)
+{
+	struct nouveau_notifier_priv *nvnotify;
+	int ret;
+
+	if (!chan || !notifier || *notifier)
+		return -EINVAL;
+
+	nvnotify = calloc(1, sizeof(struct nouveau_notifier_priv));
+	if (!nvnotify)
+		return -ENOMEM;
+	nvnotify->base.channel = chan;
+	nvnotify->base.handle  = handle;
+
+	nvnotify->drm.channel = chan->id;
+	nvnotify->drm.handle  = handle;
+	nvnotify->drm.count   = count;
+	if ((ret = drmCommandWriteRead(nouveau_device(chan->device)->fd,
+				       DRM_NOUVEAU_NOTIFIEROBJ_ALLOC,
+				       &nvnotify->drm,
+				       sizeof(nvnotify->drm)))) {
+		nouveau_notifier_free((void *)&nvnotify);
+		return ret;
+	}
+
+	nvnotify->map = (void *)nouveau_channel(chan)->notifier_block +
+				nvnotify->drm.offset;
+	*notifier = &nvnotify->base;
+	return 0;
+}
+
+void
+nouveau_notifier_free(struct nouveau_notifier **notifier)
+{
+
+	struct nouveau_notifier_priv *nvnotify;
+	struct nouveau_channel_priv *nvchan;
+	struct nouveau_device_priv *nvdev;
+	struct drm_nouveau_gpuobj_free f;
+
+	if (!notifier || !*notifier)
+		return;
+	nvnotify = nouveau_notifier(*notifier);
+	*notifier = NULL;
+
+	nvchan = nouveau_channel(nvnotify->base.channel);
+	nvdev   = nouveau_device(nvchan->base.device);
+
+	f.channel = nvchan->drm.channel;
+	f.handle  = nvnotify->base.handle;
+	drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GPUOBJ_FREE, &f, sizeof(f));		
+	free(nvnotify);
+}
+
+void
+nouveau_notifier_reset(struct nouveau_notifier *notifier, int id)
+{
+	NOTIFIER(n);
+
+	n[NV_NOTIFY_TIME_0      /4] = 0x00000000;
+	n[NV_NOTIFY_TIME_1      /4] = 0x00000000;
+	n[NV_NOTIFY_RETURN_VALUE/4] = 0x00000000;
+	n[NV_NOTIFY_STATE       /4] = (NV_NOTIFY_STATE_STATUS_IN_PROCESS <<
+				       NV_NOTIFY_STATE_STATUS_SHIFT);
+}
+
+uint32_t
+nouveau_notifier_status(struct nouveau_notifier *notifier, int id)
+{
+	NOTIFIER(n);
+
+	return n[NV_NOTIFY_STATE/4] >> NV_NOTIFY_STATE_STATUS_SHIFT;
+}
+
+uint32_t
+nouveau_notifier_return_val(struct nouveau_notifier *notifier, int id)
+{
+	NOTIFIER(n);
+
+	return n[NV_NOTIFY_RETURN_VALUE/4];
+}
+
+int
+nouveau_notifier_wait_status(struct nouveau_notifier *notifier, int id,
+			     int status, int timeout)
+{
+	NOTIFIER(n);
+	uint32_t time = 0, t_start = NOUVEAU_TIME_MSEC();
+
+	while (time <= timeout) {
+		uint32_t v;
+
+		v = n[NV_NOTIFY_STATE/4] >> NV_NOTIFY_STATE_STATUS_SHIFT;
+		if (v == status)
+			return 0;
+
+		if (timeout)
+			time = NOUVEAU_TIME_MSEC() - t_start;
+	}
+
+	return -EBUSY;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c b/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c
new file mode 100644
index 00000000000..815046ba85f
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+
+#define PB_BUFMGR_DWORDS   (4096 / 2)
+#define PB_MIN_USER_DWORDS  2048
+
+static int
+nouveau_pushbuf_space(struct nouveau_channel *chan, unsigned min)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_pushbuf_priv *nvpb = &nvchan->pb;
+
+	assert((min + 1) <= nvchan->dma->max);
+
+	/* Wait for enough space in push buffer */
+	min = min < PB_MIN_USER_DWORDS ? PB_MIN_USER_DWORDS : min;
+	min += 1; /* a bit extra for the NOP */
+	if (nvchan->dma->free < min)
+		WAIT_RING_CH(chan, min);
+
+	/* Insert NOP, may turn into a jump later */
+	RING_SPACE_CH(chan, 1);
+	nvpb->nop_jump = nvchan->dma->cur;
+	OUT_RING_CH(chan, 0);
+
+	/* Any remaining space is available to the user */
+	nvpb->start = nvchan->dma->cur;
+	nvpb->size = nvchan->dma->free;
+	nvpb->base.channel = chan;
+	nvpb->base.remaining = nvpb->size;
+	nvpb->base.cur = &nvchan->pushbuf[nvpb->start];
+
+	/* Create a new fence object for this "frame" */
+	nouveau_fence_ref(NULL, &nvpb->fence);
+	nouveau_fence_new(chan, &nvpb->fence);
+
+	return 0;
+}
+
+int
+nouveau_pushbuf_init(struct nouveau_channel *chan)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_dma_priv *m = &nvchan->dma_master;
+	struct nouveau_dma_priv *b = &nvchan->dma_bufmgr;
+	int i;
+
+	if (!nvchan)
+		return -EINVAL;
+
+	/* Reassign last bit of push buffer for a "separate" bufmgr
+	 * ring buffer
+	 */
+	m->max -= PB_BUFMGR_DWORDS;
+	m->free -= PB_BUFMGR_DWORDS;
+
+	b->base = m->base + ((m->max + 2) << 2);
+	b->max = PB_BUFMGR_DWORDS - 2;
+	b->cur = b->put = 0;
+	b->free = b->max - b->cur;
+
+	/* Some NOPs just to be safe
+	 *XXX: RING_SKIPS
+	 */
+	nvchan->dma = b;
+	RING_SPACE_CH(chan, 8);
+	for (i = 0; i < 8; i++)
+		OUT_RING_CH(chan, 0);
+	nvchan->dma = m;
+
+	nouveau_pushbuf_space(chan, 0);
+	chan->pushbuf = &nvchan->pb.base;
+
+	nvchan->pb.buffers = calloc(NOUVEAU_PUSHBUF_MAX_BUFFERS,
+				    sizeof(struct nouveau_pushbuf_bo));
+	nvchan->pb.relocs = calloc(NOUVEAU_PUSHBUF_MAX_RELOCS,
+				   sizeof(struct nouveau_pushbuf_reloc));
+	return 0;
+}
+
+static uint32_t
+nouveau_pushbuf_calc_reloc(struct nouveau_bo *bo,
+			   struct nouveau_pushbuf_reloc *r)
+{
+	uint32_t push;
+
+	if (r->flags & NOUVEAU_BO_LOW) {
+		push = bo->offset + r->data;
+	} else
+	if (r->flags & NOUVEAU_BO_HIGH) {
+		push = (bo->offset + r->data) >> 32;
+	} else {
+		push = r->data;
+	}
+
+	if (r->flags & NOUVEAU_BO_OR) {
+		if (bo->flags & NOUVEAU_BO_VRAM)
+			push |= r->vor;
+		else
+			push |= r->tor;
+	}
+
+	return push;
+}
+
+/* This would be our TTM "superioctl" */
+int
+nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min)
+{
+	struct nouveau_channel_priv *nvchan = nouveau_channel(chan);
+	struct nouveau_pushbuf_priv *nvpb = &nvchan->pb;
+	int ret, i;
+
+	if (nvpb->base.remaining == nvpb->size)
+		return 0;
+
+	nouveau_fence_flush(chan);
+
+	nvpb->size -= nvpb->base.remaining;
+	nvchan->dma->cur += nvpb->size;
+	nvchan->dma->free -= nvpb->size;
+	assert(nvchan->dma->cur <= nvchan->dma->max);
+
+	nvchan->dma = &nvchan->dma_bufmgr;
+	nvchan->pushbuf[nvpb->nop_jump] = 0x20000000 |
+		(nvchan->dma->base + (nvchan->dma->cur << 2));
+
+	/* Validate buffers + apply relocations */
+	nvchan->user_charge = 0;
+	for (i = 0; i < nvpb->nr_relocs; i++) {
+		struct nouveau_pushbuf_reloc *r = &nvpb->relocs[i];
+		struct nouveau_pushbuf_bo *pbbo = r->pbbo;
+		struct nouveau_bo *bo = pbbo->bo;
+
+		/* Validated, mem matches presumed, no relocation necessary */
+		if (pbbo->handled & 2) {
+			if (!(pbbo->handled & 1))
+				assert(0);
+			continue;
+		}
+
+		/* Not yet validated, do it now */
+		if (!(pbbo->handled & 1)) {
+			ret = nouveau_bo_validate(chan, bo, pbbo->flags);
+			if (ret) {
+				assert(0);
+				return ret;
+			}
+			pbbo->handled |= 1;
+
+			if (bo->offset == nouveau_bo(bo)->offset &&
+			    bo->flags == nouveau_bo(bo)->flags) {
+				pbbo->handled |= 2;
+				continue;
+			}
+			bo->offset = nouveau_bo(bo)->offset;
+			bo->flags = nouveau_bo(bo)->flags;
+		}
+
+		/* Apply the relocation */
+		*r->ptr = nouveau_pushbuf_calc_reloc(bo, r);
+	}
+	nvpb->nr_relocs = 0;
+
+	/* Dereference all buffers on validate list */
+	for (i = 0; i < nvpb->nr_buffers; i++) {
+		struct nouveau_pushbuf_bo *pbbo = &nvpb->buffers[i];
+
+		nouveau_bo(pbbo->bo)->pending = NULL;
+		nouveau_bo_del(&pbbo->bo);
+	}
+	nvpb->nr_buffers = 0;
+
+	/* Switch back to user's ring */
+	RING_SPACE_CH(chan, 1);
+	OUT_RING_CH(chan, 0x20000000 | ((nvpb->start << 2) +
+					nvchan->dma_master.base));
+	nvchan->dma = &nvchan->dma_master;
+
+	/* Fence + kickoff */
+	nouveau_fence_emit(nvpb->fence);
+	FIRE_RING_CH(chan);
+
+	/* Allocate space for next push buffer */
+	ret = nouveau_pushbuf_space(chan, min);
+	assert(!ret);
+
+	return 0;
+}
+
+static struct nouveau_pushbuf_bo *
+nouveau_pushbuf_emit_buffer(struct nouveau_channel *chan, struct nouveau_bo *bo)
+{
+	struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(chan->pushbuf);
+	struct nouveau_bo_priv *nvbo = nouveau_bo(bo);
+	struct nouveau_pushbuf_bo *pbbo;
+
+	if (nvbo->pending)
+		return nvbo->pending;
+
+	if (nvpb->nr_buffers >= NOUVEAU_PUSHBUF_MAX_BUFFERS)
+		return NULL;
+	pbbo = nvpb->buffers + nvpb->nr_buffers++;
+	nvbo->pending = pbbo;
+
+	nouveau_bo_ref(bo->device, bo->handle, &pbbo->bo);
+	pbbo->channel = chan;
+	pbbo->flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART;
+	pbbo->handled = 0;
+	return pbbo;
+}
+
+int
+nouveau_pushbuf_emit_reloc(struct nouveau_channel *chan, void *ptr,
+			   struct nouveau_bo *bo, uint32_t data, uint32_t flags,
+			   uint32_t vor, uint32_t tor)
+{
+	struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(chan->pushbuf);
+	struct nouveau_pushbuf_bo *pbbo;
+	struct nouveau_pushbuf_reloc *r;
+
+	if (nvpb->nr_relocs >= NOUVEAU_PUSHBUF_MAX_RELOCS)
+		return -ENOMEM;
+
+	pbbo = nouveau_pushbuf_emit_buffer(chan, bo);
+	if (!pbbo)
+		return -ENOMEM;
+	pbbo->flags |= (flags & NOUVEAU_BO_RDWR);
+	pbbo->flags &= (flags | NOUVEAU_BO_RDWR);
+
+	r = nvpb->relocs + nvpb->nr_relocs++;
+	r->pbbo = pbbo;
+	r->ptr = ptr;
+	r->flags = flags;
+	r->data = data;
+	r->vor = vor;
+	r->tor = tor;
+
+	if (flags & NOUVEAU_BO_DUMMY)
+		*(uint32_t *)ptr = 0;
+	else
+		*(uint32_t *)ptr = nouveau_pushbuf_calc_reloc(bo, r);
+	return 0;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_resource.c b/src/gallium/winsys/drm/nouveau/nouveau_resource.c
new file mode 100644
index 00000000000..3bbcb5c45e0
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_resource.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2007 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "nouveau_drmif.h"
+#include "nouveau_local.h"
+
+int
+nouveau_resource_init(struct nouveau_resource **heap,
+		      unsigned start, unsigned size)
+{
+	struct nouveau_resource *r;
+
+	r = calloc(1, sizeof(struct nouveau_resource));
+	if (!r)
+		return 1;
+
+	r->start = start;
+	r->size  = size;
+	*heap = r;
+	return 0;
+}
+
+int
+nouveau_resource_alloc(struct nouveau_resource *heap, int size, void *priv,
+		       struct nouveau_resource **res)
+{
+	struct nouveau_resource *r;
+
+	if (!heap || !size || !res || *res)
+		return 1;
+
+	while (heap) {
+		if (!heap->in_use && heap->size >= size) {
+			r = calloc(1, sizeof(struct nouveau_resource));
+			if (!r)
+				return 1;
+
+			r->start  = (heap->start + heap->size) - size;
+			r->size   = size;
+			r->in_use = 1;
+			r->priv   = priv;
+
+			heap->size -= size;
+
+			r->next = heap->next;
+			if (heap->next)
+				heap->next->prev = r;
+			r->prev = heap;
+			heap->next = r;
+
+			*res = r;
+			return 0;
+		}
+			
+		heap = heap->next;
+	}
+
+	return 1;
+}
+
+void
+nouveau_resource_free(struct nouveau_resource **res)
+{
+	struct nouveau_resource *r;
+
+	if (!res || !*res)
+		return;
+	r = *res;
+	*res = NULL;
+
+	r->in_use = 0;
+
+	if (r->next && !r->next->in_use) {
+		struct nouveau_resource *new = r->next;
+
+		new->prev = r->prev;
+		if (r->prev)
+			r->prev->next = new;
+		new->size += r->size;
+		new->start = r->start;
+
+		free(r);
+		r = new;
+	}
+
+	if (r->prev && !r->prev->in_use) {
+		r->prev->next = r->next;
+		if (r->next)
+			r->next->prev = r->prev;
+		r->prev->size += r->size;
+		free(r);
+	}
+	
+}
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_screen.c b/src/gallium/winsys/drm/nouveau/nouveau_screen.c
new file mode 100644
index 00000000000..c6d0c535887
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_screen.c
@@ -0,0 +1,265 @@
+#include "utils.h"
+#include "vblank.h"
+#include "xmlpool.h"
+
+#include "pipe/p_context.h"
+#include "state_tracker/st_public.h"
+#include "state_tracker/st_cb_fbo.h"
+
+#include "nouveau_context.h"
+#include "nouveau_drm.h"
+#include "nouveau_dri.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+
+#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 11
+#error nouveau_drm.h version does not match expected version
+#endif
+
+/* Extension stuff, enabling of extensions handled by Gallium's GL state
+ * tracker.  But, we still need to define the entry points we want.
+ */
+#define need_GL_ARB_fragment_program
+#define need_GL_ARB_multisample
+#define need_GL_ARB_occlusion_query
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_shader_objects
+#define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_program
+#define need_GL_ARB_vertex_shader
+#define need_GL_ARB_vertex_buffer_object
+#define need_GL_EXT_compiled_vertex_array
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_framebuffer_object
+#define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+#include "extension_helper.h"
+
+const struct dri_extension card_extensions[] =
+{
+	{ "GL_ARB_multisample", GL_ARB_multisample_functions },
+	{ "GL_ARB_occlusion_query", GL_ARB_occlusion_query_functions },
+	{ "GL_ARB_point_parameters", GL_ARB_point_parameters_functions },
+	{ "GL_ARB_shader_objects", GL_ARB_shader_objects_functions },
+	{ "GL_ARB_shading_language_100", GL_VERSION_2_0_functions },
+	{ "GL_ARB_shading_language_120", GL_VERSION_2_1_functions },
+	{ "GL_ARB_texture_compression", GL_ARB_texture_compression_functions },
+	{ "GL_ARB_vertex_program", GL_ARB_vertex_program_functions },
+	{ "GL_ARB_vertex_shader", GL_ARB_vertex_shader_functions },
+	{ "GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions },
+	{ "GL_EXT_compiled_vertex_array", GL_EXT_compiled_vertex_array_functions },
+	{ "GL_EXT_fog_coord", GL_EXT_fog_coord_functions },
+	{ "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+	{ "GL_EXT_secondary_color", GL_EXT_secondary_color_functions },
+	{ NULL, 0 }
+};
+
+PUBLIC const char __driConfigOptions[] =
+DRI_CONF_BEGIN
+DRI_CONF_END;
+static const GLuint __driNConfigOptions = 0;
+
+extern const struct dri_extension common_extensions[];
+extern const struct dri_extension nv40_extensions[];
+
+static GLboolean
+nouveau_create_buffer(__DRIscreenPrivate * driScrnPriv,
+		      __DRIdrawablePrivate * driDrawPriv,
+		      const __GLcontextModes *glVis, GLboolean pixmapBuffer)
+{
+	struct nouveau_framebuffer *nvfb;
+	enum pipe_format colour, depth, stencil;
+
+	if (pixmapBuffer)
+		return GL_FALSE;
+
+	nvfb = CALLOC_STRUCT(nouveau_framebuffer);
+	if (!nvfb)
+		return GL_FALSE;
+
+	if (glVis->redBits == 5)
+		colour = PIPE_FORMAT_R5G6B5_UNORM;
+	else
+		colour = PIPE_FORMAT_A8R8G8B8_UNORM;
+
+	if (glVis->depthBits == 16)
+		depth = PIPE_FORMAT_Z16_UNORM;
+	else if (glVis->depthBits == 24)
+		depth = PIPE_FORMAT_Z24S8_UNORM;
+	else
+		depth = PIPE_FORMAT_NONE;
+
+	if (glVis->stencilBits == 8)
+		stencil = PIPE_FORMAT_Z24S8_UNORM;
+	else
+		stencil = PIPE_FORMAT_NONE;
+
+	nvfb->stfb = st_create_framebuffer(glVis, colour, depth, stencil,
+					   driDrawPriv->w, driDrawPriv->h,
+					   (void*)nvfb);
+	if (!nvfb->stfb) {
+		free(nvfb);
+		return  GL_FALSE;
+	}
+
+	driDrawPriv->driverPrivate = (void *)nvfb;
+	return GL_TRUE;
+}
+
+static void
+nouveau_destroy_buffer(__DRIdrawablePrivate * driDrawPriv)
+{
+	struct nouveau_framebuffer *nvfb;
+	
+	nvfb = (struct nouveau_framebuffer *)driDrawPriv->driverPrivate;
+	st_unreference_framebuffer(nvfb->stfb);
+	free(nvfb);
+}
+
+static __DRIconfig **
+nouveau_fill_in_modes(__DRIscreenPrivate *psp,
+		      unsigned pixel_bits, unsigned depth_bits,
+		      unsigned stencil_bits, GLboolean have_back_buffer)
+{
+	__DRIconfig **configs;
+	unsigned depth_buffer_factor;
+	unsigned back_buffer_factor;
+	GLenum fb_format;
+	GLenum fb_type;
+
+	static const GLenum back_buffer_modes[] = {
+		GLX_NONE, GLX_SWAP_UNDEFINED_OML,
+	};
+
+	uint8_t depth_bits_array[3];
+	uint8_t stencil_bits_array[3];
+	uint8_t msaa_samples_array[1];
+
+	depth_bits_array[0] = 0;
+	depth_bits_array[1] = depth_bits;
+	depth_bits_array[2] = depth_bits;
+
+	/* Just like with the accumulation buffer, always provide some modes
+	 * with a stencil buffer.  It will be a sw fallback, but some apps won't
+	 * care about that.
+	 */
+	stencil_bits_array[0] = 0;
+	stencil_bits_array[1] = 0;
+	if (depth_bits == 24)
+		stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
+	stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
+
+	msaa_samples_array[0] = 0;
+
+	depth_buffer_factor =
+		((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
+	back_buffer_factor = (have_back_buffer) ? 3 : 1;
+
+	if (pixel_bits == 16) {
+		fb_format = GL_RGB;
+		fb_type = GL_UNSIGNED_SHORT_5_6_5;
+	}
+	else {
+		fb_format = GL_BGRA;
+		fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+	}
+
+	configs = driCreateConfigs(fb_format, fb_type,
+				   depth_bits_array, stencil_bits_array,
+				   depth_buffer_factor, back_buffer_modes,
+				   back_buffer_factor, msaa_samples_array, 1);
+	if (configs == NULL) {
+	 fprintf(stderr, "[%s:%u] Error creating FBConfig!\n",
+			 __func__, __LINE__);
+		return NULL;
+	}
+
+	return configs;
+}
+
+static const __DRIconfig **
+nouveau_screen_create(__DRIscreenPrivate *psp)
+{
+	struct nouveau_dri *nv_dri = psp->pDevPriv;
+	struct nouveau_screen *nv_screen;
+	static const __DRIversion ddx_expected =
+		{ 0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL };
+	static const __DRIversion dri_expected = { 4, 0, 0 };
+	static const __DRIversion drm_expected =
+		{ 0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL };
+	int ret;
+
+	if (!driCheckDriDdxDrmVersions2("nouveau",
+					&psp->dri_version, &dri_expected,
+					&psp->ddx_version, &ddx_expected,
+					&psp->drm_version, &drm_expected)) {
+		return NULL;
+	}
+
+	if (drm_expected.patch != psp->drm_version.patch) {
+		fprintf(stderr, "Incompatible DRM patch level.\n"
+				"Expected: %d\n" "Current : %d\n",
+			drm_expected.patch, psp->drm_version.patch);
+		return NULL;
+	}
+
+	driInitExtensions(NULL, card_extensions, GL_FALSE);
+
+	if (psp->devPrivSize != sizeof(struct nouveau_dri)) {
+		NOUVEAU_ERR("DRI struct mismatch between DDX/DRI\n");
+		return GL_FALSE;
+	}
+
+	nv_screen = CALLOC_STRUCT(nouveau_screen);
+	if (!nv_screen)
+		return GL_FALSE;
+	nv_screen->driScrnPriv = psp;
+	psp->private = (void *)nv_screen;
+
+	driParseOptionInfo(&nv_screen->option_cache,
+			   __driConfigOptions, __driNConfigOptions);
+
+	if ((ret = nouveau_device_open_existing(&nv_screen->device, 0,
+						psp->fd, 0))) {
+		NOUVEAU_ERR("Failed opening nouveau device: %d\n", ret);
+		return GL_FALSE;
+	}
+
+	nv_screen->front_offset = nv_dri->front_offset;
+	nv_screen->front_pitch  = nv_dri->front_pitch * (nv_dri->bpp / 8);
+	nv_screen->front_cpp = nv_dri->bpp / 8;
+	nv_screen->front_height = nv_dri->height;
+
+	return (const __DRIconfig **)
+		nouveau_fill_in_modes(psp, nv_dri->bpp,
+				      (nv_dri->bpp == 16) ? 16 : 24,
+				      (nv_dri->bpp == 16) ? 0 : 8, 1);
+}
+
+static void
+nouveau_screen_destroy(__DRIscreenPrivate *driScrnPriv)
+{
+	struct nouveau_screen *nv_screen = driScrnPriv->private;
+
+	driScrnPriv->private = NULL;
+	FREE(nv_screen);
+}
+
+const struct __DriverAPIRec
+driDriverAPI = {
+	.InitScreen	= nouveau_screen_create,
+	.DestroyScreen	= nouveau_screen_destroy,
+	.CreateContext	= nouveau_context_create,
+	.DestroyContext	= nouveau_context_destroy,
+	.CreateBuffer	= nouveau_create_buffer,
+	.DestroyBuffer	= nouveau_destroy_buffer,
+	.SwapBuffers	= nouveau_swap_buffers,
+	.MakeCurrent	= nouveau_context_bind,
+	.UnbindContext	= nouveau_context_unbind,
+	.CopySubBuffer	= nouveau_copy_sub_buffer,
+
+	.InitScreen2	= NULL, /* one day, I promise! */
+};
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_screen.h b/src/gallium/winsys/drm/nouveau/nouveau_screen.h
new file mode 100644
index 00000000000..388d6be9bbc
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_screen.h
@@ -0,0 +1,20 @@
+#ifndef __NOUVEAU_SCREEN_H__
+#define __NOUVEAU_SCREEN_H__
+
+#include "xmlconfig.h"
+
+struct nouveau_screen {
+	__DRIscreenPrivate *driScrnPriv;
+	driOptionCache      option_cache;
+
+	struct nouveau_device *device;
+
+	uint32_t front_offset;
+	uint32_t front_pitch;
+	uint32_t front_cpp;
+	uint32_t front_height;
+
+	void *nvc;
+};
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c
new file mode 100644
index 00000000000..70e0104e83b
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c
@@ -0,0 +1,86 @@
+#include "main/glheader.h"
+#include "glapi/glthread.h"
+#include <GL/internal/glcore.h>
+
+#include "pipe/p_context.h"
+#include "state_tracker/st_public.h"
+#include "state_tracker/st_context.h"
+#include "state_tracker/st_cb_fbo.h"
+
+#include "nouveau_context.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+
+void
+nouveau_copy_buffer(__DRIdrawablePrivate *dPriv, struct pipe_surface *surf,
+		    const drm_clip_rect_t *rect)
+{
+	struct nouveau_context *nv = dPriv->driContextPriv->driverPrivate;
+	drm_clip_rect_t *pbox;
+	int nbox, i;
+
+	LOCK_HARDWARE(nv);
+	if (!dPriv->numClipRects) {
+		UNLOCK_HARDWARE(nv);
+		return;
+	}
+	pbox = dPriv->pClipRects;
+	nbox = dPriv->numClipRects;
+
+	nv->surface_copy_prep(nv, nv->frontbuffer, surf);
+	for (i = 0; i < nbox; i++, pbox++) {
+		int sx, sy, dx, dy, w, h;
+
+		sx = pbox->x1 - dPriv->x;
+		sy = pbox->y1 - dPriv->y;
+		dx = pbox->x1;
+		dy = pbox->y1;
+		w  = pbox->x2 - pbox->x1;
+		h  = pbox->y2 - pbox->y1;
+
+		nv->surface_copy(nv, dx, dy, sx, sy, w, h);
+	}
+
+	FIRE_RING(nv->nvc->channel);
+	UNLOCK_HARDWARE(nv);
+
+	if (nv->last_stamp != dPriv->lastStamp) {
+		struct nouveau_framebuffer *nvfb = dPriv->driverPrivate;
+		st_resize_framebuffer(nvfb->stfb, dPriv->w, dPriv->h);
+		nv->last_stamp = dPriv->lastStamp;
+	}
+}
+
+void
+nouveau_copy_sub_buffer(__DRIdrawablePrivate *dPriv, int x, int y, int w, int h)
+{
+	struct nouveau_framebuffer *nvfb = dPriv->driverPrivate;
+	struct pipe_surface *surf;
+
+	surf = st_get_framebuffer_surface(nvfb->stfb, ST_SURFACE_BACK_LEFT);
+	if (surf) {
+		drm_clip_rect_t rect;
+		rect.x1 = x;
+		rect.y1 = y;
+		rect.x2 = x + w;
+		rect.y2 = y + h;
+
+		st_notify_swapbuffers(nvfb->stfb);
+		nouveau_copy_buffer(dPriv, surf, &rect);
+	}
+}
+
+void
+nouveau_swap_buffers(__DRIdrawablePrivate *dPriv)
+{
+	struct nouveau_framebuffer *nvfb = dPriv->driverPrivate;
+	struct pipe_surface *surf;
+
+	surf = st_get_framebuffer_surface(nvfb->stfb, ST_SURFACE_BACK_LEFT);
+	if (surf) {
+		st_notify_swapbuffers(nvfb->stfb);
+		nouveau_copy_buffer(dPriv, surf, NULL);
+	}
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h
new file mode 100644
index 00000000000..825d3da6da5
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h
@@ -0,0 +1,10 @@
+#ifndef __NOUVEAU_SWAPBUFFERS_H__
+#define __NOUVEAU_SWAPBUFFERS_H__
+
+extern void nouveau_copy_buffer(__DRIdrawablePrivate *, struct pipe_surface *,
+				const drm_clip_rect_t *);
+extern void nouveau_copy_sub_buffer(__DRIdrawablePrivate *,
+				    int x, int y, int w, int h);
+extern void nouveau_swap_buffers(__DRIdrawablePrivate *);
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys.c
new file mode 100644
index 00000000000..364340e1d3d
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys.c
@@ -0,0 +1,161 @@
+#include "util/u_memory.h"
+
+#include "nouveau_context.h"
+#include "nouveau_screen.h"
+#include "nouveau_winsys_pipe.h"
+
+#include "nouveau/nouveau_winsys.h"
+
+static int
+nouveau_pipe_notifier_alloc(struct nouveau_winsys *nvws, int count,
+			    struct nouveau_notifier **notify)
+{
+	struct nouveau_context *nv = nvws->nv;
+
+	return nouveau_notifier_alloc(nv->nvc->channel, nv->nvc->next_handle++,
+				      count, notify);
+}
+
+static int
+nouveau_pipe_grobj_alloc(struct nouveau_winsys *nvws, int grclass,
+			 struct nouveau_grobj **grobj)
+{
+	struct nouveau_context *nv = nvws->nv;
+	struct nouveau_channel *chan = nv->nvc->channel;
+	int ret;
+
+	ret = nouveau_grobj_alloc(chan, nv->nvc->next_handle++,
+				  grclass, grobj);
+	if (ret)
+		return ret;
+
+	assert(nv->nvc->next_subchannel < 7);
+	BIND_RING(chan, *grobj, nv->nvc->next_subchannel++);
+	return 0;
+}
+
+static int
+nouveau_pipe_surface_copy(struct nouveau_winsys *nvws, struct pipe_surface *dst,
+			  unsigned dx, unsigned dy, struct pipe_surface *src,
+			  unsigned sx, unsigned sy, unsigned w, unsigned h)
+{
+	struct nouveau_context *nv = nvws->nv;
+
+	if (nv->surface_copy_prep(nv, dst, src))
+		return 1;
+	nv->surface_copy(nv, dx, dy, sx, sy, w, h);
+	nv->surface_copy_done(nv);
+
+	return 0;
+}
+
+static int
+nouveau_pipe_surface_fill(struct nouveau_winsys *nvws, struct pipe_surface *dst,
+			  unsigned dx, unsigned dy, unsigned w, unsigned h,
+			  unsigned value)
+{
+	if (nvws->nv->surface_fill(nvws->nv, dst, dx, dy, w, h, value))
+		return 1;
+	return 0;
+}
+
+static int
+nouveau_pipe_push_reloc(struct nouveau_winsys *nvws, void *ptr,
+			struct pipe_buffer *buf, uint32_t data,
+			uint32_t flags, uint32_t vor, uint32_t tor)
+{
+	return nouveau_pushbuf_emit_reloc(nvws->channel, ptr,
+					  nouveau_buffer(buf)->bo,
+					  data, flags, vor, tor);
+}
+
+static int
+nouveau_pipe_push_flush(struct nouveau_winsys *nvws, unsigned size,
+			struct pipe_fence_handle **fence)
+{
+	if (fence) {
+		struct nouveau_pushbuf *pb = nvws->channel->pushbuf;
+		struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(pb);
+		struct nouveau_fence *ref = NULL;
+
+		nouveau_fence_ref(nvpb->fence, &ref);
+		*fence = (struct pipe_fence_handle *)ref;
+	}
+
+	return nouveau_pushbuf_flush(nvws->channel, size);
+}
+
+struct pipe_context *
+nouveau_pipe_create(struct nouveau_context *nv)
+{
+	struct nouveau_channel_context *nvc = nv->nvc;
+	struct nouveau_winsys *nvws = CALLOC_STRUCT(nouveau_winsys);
+	struct pipe_screen *(*hws_create)(struct pipe_winsys *,
+					  struct nouveau_winsys *);
+	struct pipe_context *(*hw_create)(struct pipe_screen *, unsigned);
+	struct pipe_winsys *ws;
+	unsigned chipset = nv->nv_screen->device->chipset;
+
+	if (!nvws)
+		return NULL;
+
+	switch (chipset & 0xf0) {
+	case 0x10:
+		hws_create = nv10_screen_create;
+		hw_create = nv10_create;
+		break;
+	case 0x20:
+		hws_create = nv20_screen_create;
+		hw_create = nv20_create;
+		break;
+	case 0x30:
+		hws_create = nv30_screen_create;
+		hw_create = nv30_create;
+		break;
+	case 0x40:
+	case 0x60:
+		hws_create = nv40_screen_create;
+		hw_create = nv40_create;
+		break;
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		hws_create = nv50_screen_create;
+		hw_create = nv50_create;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown chipset NV%02x\n", chipset);
+		return NULL;
+	}
+
+	nvws->nv		= nv;
+	nvws->channel		= nv->nvc->channel;
+
+	nvws->res_init		= nouveau_resource_init;
+	nvws->res_alloc		= nouveau_resource_alloc;
+	nvws->res_free		= nouveau_resource_free;
+
+	nvws->push_reloc        = nouveau_pipe_push_reloc;
+	nvws->push_flush	= nouveau_pipe_push_flush;
+
+	nvws->grobj_alloc	= nouveau_pipe_grobj_alloc;
+	nvws->grobj_free	= nouveau_grobj_free;
+
+	nvws->notifier_alloc	= nouveau_pipe_notifier_alloc;
+	nvws->notifier_free	= nouveau_notifier_free;
+	nvws->notifier_reset	= nouveau_notifier_reset;
+	nvws->notifier_status	= nouveau_notifier_status;
+	nvws->notifier_retval	= nouveau_notifier_return_val;
+	nvws->notifier_wait	= nouveau_notifier_wait_status;
+
+	nvws->surface_copy	= nouveau_pipe_surface_copy;
+	nvws->surface_fill	= nouveau_pipe_surface_fill;
+
+	ws = nouveau_create_pipe_winsys(nv);
+
+	if (!nvc->pscreen)
+		nvc->pscreen = hws_create(ws, nvws);
+	nvc->pctx[nv->pctx_id] = hw_create(nvc->pscreen, nv->pctx_id);
+	return nvc->pctx[nv->pctx_id];
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c
new file mode 100644
index 00000000000..5276806de6b
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c
@@ -0,0 +1,206 @@
+#include "pipe/p_winsys.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_memory.h"
+
+#include "nouveau_context.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+#include "nouveau_winsys_pipe.h"
+
+static void
+nouveau_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surf,
+			  void *context_private)
+{
+	struct nouveau_context *nv = context_private;
+	__DRIdrawablePrivate *dPriv = nv->dri_drawable;
+
+	nouveau_copy_buffer(dPriv, surf, NULL);
+}
+
+static const char *
+nouveau_get_name(struct pipe_winsys *pws)
+{
+	return "Nouveau/DRI";
+}
+
+static struct pipe_buffer *
+nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment,
+		       unsigned usage, unsigned size)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws;
+	struct nouveau_context *nv = nvpws->nv;
+	struct nouveau_device *dev = nv->nv_screen->device;
+	struct nouveau_pipe_buffer *nvbuf;
+	uint32_t flags;
+
+	nvbuf = calloc(1, sizeof(*nvbuf));
+	if (!nvbuf)
+		return NULL;
+	nvbuf->base.refcount = 1;
+	nvbuf->base.alignment = alignment;
+	nvbuf->base.usage = usage;
+	nvbuf->base.size = size;
+
+	flags = NOUVEAU_BO_LOCAL;
+
+	if (usage & PIPE_BUFFER_USAGE_PIXEL) {
+		if (usage & NOUVEAU_BUFFER_USAGE_TEXTURE)
+			flags |= NOUVEAU_BO_GART;
+		flags |= NOUVEAU_BO_VRAM;
+
+		switch (dev->chipset & 0xf0) {
+		case 0x50:
+		case 0x80:
+		case 0x90:
+			flags |= NOUVEAU_BO_TILED;
+			if (usage & NOUVEAU_BUFFER_USAGE_ZETA)
+				flags |= NOUVEAU_BO_ZTILE;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (usage & PIPE_BUFFER_USAGE_VERTEX) {
+		if (nv->cap.hw_vertex_buffer)
+			flags |= NOUVEAU_BO_GART;
+	}
+
+	if (usage & PIPE_BUFFER_USAGE_INDEX) {
+		if (nv->cap.hw_index_buffer)
+			flags |= NOUVEAU_BO_GART;
+	}
+
+	if (nouveau_bo_new(dev, flags, alignment, size, &nvbuf->bo)) {
+		free(nvbuf);
+		return NULL;
+	}
+
+	return &nvbuf->base;
+}
+
+static struct pipe_buffer *
+nouveau_pipe_bo_user_create(struct pipe_winsys *pws, void *ptr, unsigned bytes)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws;
+	struct nouveau_device *dev = nvpws->nv->nv_screen->device;
+	struct nouveau_pipe_buffer *nvbuf;
+
+	nvbuf = calloc(1, sizeof(*nvbuf));
+	if (!nvbuf)
+		return NULL;
+	nvbuf->base.refcount = 1;
+	nvbuf->base.size = bytes;
+
+	if (nouveau_bo_user(dev, ptr, bytes, &nvbuf->bo)) {
+		free(nvbuf);
+		return NULL;
+	}
+
+	return &nvbuf->base;
+}
+
+static void
+nouveau_pipe_bo_del(struct pipe_winsys *ws, struct pipe_buffer *buf)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+
+	nouveau_bo_del(&nvbuf->bo);
+	free(nvbuf);
+}
+
+static void *
+nouveau_pipe_bo_map(struct pipe_winsys *pws, struct pipe_buffer *buf,
+		    unsigned flags)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+	uint32_t map_flags = 0;
+
+	if (flags & PIPE_BUFFER_USAGE_CPU_READ)
+		map_flags |= NOUVEAU_BO_RD;
+	if (flags & PIPE_BUFFER_USAGE_CPU_WRITE)
+		map_flags |= NOUVEAU_BO_WR;
+
+	if (nouveau_bo_map(nvbuf->bo, map_flags))
+		return NULL;
+	return nvbuf->bo->map;
+}
+
+static void
+nouveau_pipe_bo_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+
+	nouveau_bo_unmap(nvbuf->bo);
+}
+
+static INLINE struct nouveau_fence *
+nouveau_pipe_fence(struct pipe_fence_handle *pfence)
+{
+	return (struct nouveau_fence *)pfence;
+}
+
+static void
+nouveau_pipe_fence_reference(struct pipe_winsys *ws,
+			     struct pipe_fence_handle **ptr,
+			     struct pipe_fence_handle *pfence)
+{
+	nouveau_fence_ref((void *)pfence, (void *)ptr);
+}
+
+static int
+nouveau_pipe_fence_signalled(struct pipe_winsys *ws,
+			     struct pipe_fence_handle *pfence, unsigned flag)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)ws;
+	struct nouveau_fence *fence = nouveau_pipe_fence(pfence);
+
+	if (nouveau_fence(fence)->signalled == 0)
+		nouveau_fence_flush(nvpws->nv->nvc->channel);
+
+	return !nouveau_fence(fence)->signalled;
+}
+
+static int
+nouveau_pipe_fence_finish(struct pipe_winsys *ws,
+			  struct pipe_fence_handle *pfence, unsigned flag)
+{
+	struct nouveau_fence *fence = nouveau_pipe_fence(pfence);
+	struct nouveau_fence *ref = NULL;
+
+	nouveau_fence_ref(fence, &ref);
+	return nouveau_fence_wait(&ref);
+}
+
+struct pipe_winsys *
+nouveau_create_pipe_winsys(struct nouveau_context *nv)
+{
+	struct nouveau_pipe_winsys *nvpws;
+	struct pipe_winsys *pws;
+
+	nvpws = CALLOC_STRUCT(nouveau_pipe_winsys);
+	if (!nvpws)
+		return NULL;
+	nvpws->nv = nv;
+	pws = &nvpws->pws;
+
+	pws->flush_frontbuffer = nouveau_flush_frontbuffer;
+
+	pws->buffer_create = nouveau_pipe_bo_create;
+	pws->buffer_destroy = nouveau_pipe_bo_del;
+	pws->user_buffer_create = nouveau_pipe_bo_user_create;
+	pws->buffer_map = nouveau_pipe_bo_map;
+	pws->buffer_unmap = nouveau_pipe_bo_unmap;
+
+	pws->fence_reference = nouveau_pipe_fence_reference;
+	pws->fence_signalled = nouveau_pipe_fence_signalled;
+	pws->fence_finish = nouveau_pipe_fence_finish;
+
+	pws->get_name = nouveau_get_name;
+
+	return &nvpws->pws;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h
new file mode 100644
index 00000000000..6a03ac0d773
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h
@@ -0,0 +1,34 @@
+#ifndef NOUVEAU_PIPE_WINSYS_H
+#define NOUVEAU_PIPE_WINSYS_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+#include "nouveau_context.h"
+
+struct nouveau_pipe_buffer {
+	struct pipe_buffer base;
+	struct nouveau_bo *bo;
+};
+
+static inline struct nouveau_pipe_buffer *
+nouveau_buffer(struct pipe_buffer *buf)
+{
+	return (struct nouveau_pipe_buffer *)buf;
+}
+
+struct nouveau_pipe_winsys {
+	struct pipe_winsys pws;
+
+	struct nouveau_context *nv;
+};
+
+extern struct pipe_winsys *
+nouveau_create_pipe_winsys(struct nouveau_context *nv);
+
+struct pipe_context *
+nouveau_create_softpipe(struct nouveau_context *nv);
+
+struct pipe_context *
+nouveau_pipe_create(struct nouveau_context *nv);
+
+#endif
diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c
new file mode 100644
index 00000000000..704f6c77506
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c
@@ -0,0 +1,85 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+/*
+ * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com>
+ */
+
+#include "imports.h"
+
+#include "pipe/p_defines.h"
+#include "pipe/p_format.h"
+#include "softpipe/sp_winsys.h"
+
+#include "nouveau_context.h"
+#include "nouveau_winsys_pipe.h"
+
+struct nouveau_softpipe_winsys {
+   struct softpipe_winsys sws;
+   struct nouveau_context *nv;
+};
+
+/**
+ * Return list of surface formats supported by this driver.
+ */
+static boolean
+nouveau_is_format_supported(struct softpipe_winsys *sws, uint format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_R5G6B5_UNORM:
+	case PIPE_FORMAT_Z24S8_UNORM:
+		return TRUE;
+	default:
+		break;
+	};
+
+	return FALSE;
+}
+
+struct pipe_context *
+nouveau_create_softpipe(struct nouveau_context *nv)
+{
+	struct nouveau_softpipe_winsys *nvsws;
+	struct pipe_screen *pscreen;
+	struct pipe_winsys *ws;
+
+	ws = nouveau_create_pipe_winsys(nv);
+	if (!ws)
+		return NULL;
+	pscreen = softpipe_create_screen(ws);
+
+	nvsws = CALLOC_STRUCT(nouveau_softpipe_winsys);
+	if (!nvsws)
+		return NULL;
+
+	nvsws->sws.is_format_supported = nouveau_is_format_supported;
+	nvsws->nv = nv;
+
+	return softpipe_create(pscreen, ws, &nvsws->sws);
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nv04_surface.c b/src/gallium/winsys/drm/nouveau/nv04_surface.c
new file mode 100644
index 00000000000..d08955de718
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nv04_surface.c
@@ -0,0 +1,413 @@
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+
+#include "nouveau_context.h"
+
+static INLINE int log2i(int i)
+{
+	int r = 0;
+
+	if (i & 0xffff0000) {
+		i >>= 16;
+		r += 16;
+	}
+	if (i & 0x0000ff00) {
+		i >>= 8;
+		r += 8;
+	}
+	if (i & 0x000000f0) {
+		i >>= 4;
+		r += 4;
+	}
+	if (i & 0x0000000c) {
+		i >>= 2;
+		r += 2;
+	}
+	if (i & 0x00000002) {
+		r += 1;
+	}
+	return r;
+}
+
+static INLINE int
+nv04_surface_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_Z24S8_UNORM:
+		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+	default:
+		return -1;
+	}
+}
+
+static INLINE int
+nv04_rect_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_Z24S8_UNORM:
+		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+	default:
+		return -1;
+	}
+}
+
+static void
+nv04_surface_copy_m2mf(struct nouveau_context *nv, unsigned dx, unsigned dy,
+		       unsigned sx, unsigned sy, unsigned w, unsigned h)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	struct pipe_surface *dst = nv->surf_dst;
+	struct pipe_surface *src = nv->surf_src;
+	unsigned dst_offset, src_offset;
+
+	dst_offset = dst->offset + (dy * dst->stride) + (dx * dst->block.size);
+	src_offset = src->offset + (sy * src->stride) + (sx * src->block.size);
+
+	while (h) {
+		int count = (h > 2047) ? 2047 : h;
+
+		BEGIN_RING(chan, nv->nvc->NvM2MF,
+			   NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+		OUT_RELOCl(chan, nouveau_buffer(src->buffer)->bo, src_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+		OUT_RING  (chan, src->stride);
+		OUT_RING  (chan, dst->stride);
+		OUT_RING  (chan, w * src->block.size);
+		OUT_RING  (chan, count);
+		OUT_RING  (chan, 0x0101);
+		OUT_RING  (chan, 0);
+
+		h -= count;
+		src_offset += src->stride * count;
+		dst_offset += dst->stride * count;
+	}
+}
+
+static void
+nv04_surface_copy_blit(struct nouveau_context *nv, unsigned dx, unsigned dy,
+		       unsigned sx, unsigned sy, unsigned w, unsigned h)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+
+	BEGIN_RING(chan, nv->nvc->NvImageBlit, 0x0300, 3);
+	OUT_RING  (chan, (sy << 16) | sx);
+	OUT_RING  (chan, (dy << 16) | dx);
+	OUT_RING  (chan, ( h << 16) |  w);
+}
+
+static int
+nv04_surface_copy_prep_swizzled(struct nouveau_context *nv,
+				struct pipe_surface *dst,
+				struct pipe_surface *src)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+
+	BEGIN_RING(chan, nv->nvc->NvSwzSurf,
+		   NV04_SWIZZLED_SURFACE_FORMAT, 2);
+	/* FIXME: read destination format from somewhere */
+	OUT_RING  (chan,
+		NV04_SWIZZLED_SURFACE_FORMAT_COLOR_A8R8G8B8
+		| (log2i(dst->width)<<NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT)
+		| (log2i(dst->height)<<NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT) );
+	OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, nv->nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 13);
+	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+	/* FIXME: read source format from somewhere */
+	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8);
+	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (src->height<<16) | src->width);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, (src->height<<16) | src->width);
+	OUT_RING  (chan, 1<<20);
+	OUT_RING  (chan, 1<<20);
+	OUT_RING  (chan, (src->height<<16) | src->width);
+	OUT_RING  (chan,
+		src->stride
+		| NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER
+		| NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+	OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RING  (chan, 0);
+
+	BEGIN_RING(chan, nv->nvc->NvM2MF,
+		   NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	nv->surface_copy = nv04_surface_copy_m2mf;
+	nv->surf_dst = dst;
+	nv->surf_src = src;
+	return 0;
+}
+
+static int
+nv04_surface_copy_prep(struct nouveau_context *nv, struct pipe_surface *dst,
+		       struct pipe_surface *src)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	int format;
+
+	if (src->format != dst->format)
+		return 1;
+
+	/* Setup transfer to swizzle the texture to vram if needed */
+	/* FIXME/TODO: check proper limits of this operation */
+	if (nouveau_buffer(dst->buffer)->bo->flags & NOUVEAU_BO_SWIZZLED) {
+		/* FIXME: Disable it for the moment */
+		/*return nv04_surface_copy_prep_swizzled(nv, dst, src);*/
+	}
+
+	/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
+	 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
+	 */
+	if ((src->offset & 63) || (dst->offset & 63)) {
+		BEGIN_RING(nv->nvc->channel, nv->nvc->NvM2MF,
+			   NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+		OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo,
+			   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+		OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+			   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+		nv->surface_copy = nv04_surface_copy_m2mf;
+		nv->surf_dst = dst;
+		nv->surf_src = src;
+		return 0;
+
+	}
+
+	if ((format = nv04_surface_format(dst->format)) < 0) {
+		NOUVEAU_ERR("Bad surface format 0x%x\n", dst->format);
+		return 1;
+	}
+	nv->surface_copy = nv04_surface_copy_blit;
+
+	BEGIN_RING(chan, nv->nvc->NvCtxSurf2D,
+		   NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, nv->nvc->NvCtxSurf2D,
+		   NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst->stride << 16) | src->stride);
+	OUT_RELOCl(chan, nouveau_buffer(src->buffer)->bo, src->offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	return 0;
+}
+
+static void
+nv04_surface_copy_done(struct nouveau_context *nv)
+{
+	FIRE_RING(nv->nvc->channel);
+}
+
+static int
+nv04_surface_fill(struct nouveau_context *nv, struct pipe_surface *dst,
+		  unsigned dx, unsigned dy, unsigned w, unsigned h,
+		  unsigned value)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	struct nouveau_grobj *surf2d = nv->nvc->NvCtxSurf2D;
+	struct nouveau_grobj *rect = nv->nvc->NvGdiRect;
+	int cs2d_format, gdirect_format;
+
+	if ((cs2d_format = nv04_surface_format(dst->format)) < 0) {
+		NOUVEAU_ERR("Bad format = %d\n", dst->format);
+		return 1;
+	}
+
+	if ((gdirect_format = nv04_rect_format(dst->format)) < 0) {
+		NOUVEAU_ERR("Bad format = %d\n", dst->format);
+		return 1;
+	}
+
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, cs2d_format);
+	OUT_RING  (chan, (dst->stride << 16) | dst->stride);
+	OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset,
+		   NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, gdirect_format);
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+	BEGIN_RING(chan, rect,
+		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dx << 16) | dy);
+	OUT_RING  (chan, ( w << 16) |  h);
+
+	FIRE_RING(chan);
+	return 0;
+}
+
+int
+nouveau_surface_channel_create_nv04(struct nouveau_channel_context *nvc)
+{
+	struct nouveau_channel *chan = nvc->channel;
+	unsigned chipset = nvc->channel->device->chipset, class;
+	int ret;
+
+	if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, 0x39,
+				       &nvc->NvM2MF))) {
+		NOUVEAU_ERR("Error creating m2mf object: %d\n", ret);
+		return 1;
+	}
+	BIND_RING (chan, nvc->NvM2MF, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvM2MF,
+		   NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, nvc->sync_notifier->handle);
+
+	class = chipset < 0x10 ? NV04_CONTEXT_SURFACES_2D :
+				 NV10_CONTEXT_SURFACES_2D;
+	if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class,
+				       &nvc->NvCtxSurf2D))) {
+		NOUVEAU_ERR("Error creating 2D surface object: %d\n", ret);
+		return 1;
+	}
+	BIND_RING (chan, nvc->NvCtxSurf2D, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvCtxSurf2D,
+		   NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, nvc->channel->vram->handle);
+	OUT_RING  (chan, nvc->channel->vram->handle);
+
+	class = chipset < 0x10 ? NV04_IMAGE_BLIT : NV12_IMAGE_BLIT;
+	if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class,
+				       &nvc->NvImageBlit))) {
+		NOUVEAU_ERR("Error creating blit object: %d\n", ret);
+		return 1;
+	}
+	BIND_RING (chan, nvc->NvImageBlit, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, nvc->sync_notifier->handle);
+	BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, nvc->NvCtxSurf2D->handle);
+	BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV04_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	class = NV04_GDI_RECTANGLE_TEXT;
+	if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class,
+				       &nvc->NvGdiRect))) {
+		NOUVEAU_ERR("Error creating rect object: %d\n", ret);
+		return 1;
+	}
+	BIND_RING (chan, nvc->NvGdiRect, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, nvc->sync_notifier->handle);
+	BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, nvc->NvCtxSurf2D->handle);
+	BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, nvc->NvGdiRect,
+		   NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	switch (chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Famous last words: this really can't happen.. */
+		assert(0);
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class,
+				  &nvc->NvSwzSurf);
+	if (ret) {
+		NOUVEAU_ERR("Error creating swizzled surface: %d\n", ret);
+		return 1;
+	}
+
+	BIND_RING (chan, nvc->NvSwzSurf, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvSwzSurf, NV04_SWIZZLED_SURFACE_DMA_NOTIFY, 1);
+	OUT_RING  (chan, nvc->sync_notifier->handle);
+	BEGIN_RING(chan, nvc->NvSwzSurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RING  (chan, nvc->channel->vram->handle);
+
+	if (chipset < 0x10) {
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+	} else
+	if (chipset < 0x40) {
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+	} else {
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+	}
+
+	ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class,
+				  &nvc->NvSIFM);
+	if (ret) {
+		NOUVEAU_ERR("Error creating scaled image object: %d\n", ret);
+		return 1;
+	}
+
+	BIND_RING (chan, nvc->NvSIFM, nvc->next_subchannel++);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RING  (chan, nvc->channel->vram->handle);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, nvc->NvSwzSurf->handle);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_PATTERN, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_ROP, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_BETA1, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_BETA4, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION, 1);
+	OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+
+	return 0;
+}
+
+int
+nouveau_surface_init_nv04(struct nouveau_context *nv)
+{
+	nv->surface_copy_prep = nv04_surface_copy_prep;
+	nv->surface_copy = nv04_surface_copy_blit;
+	nv->surface_copy_done = nv04_surface_copy_done;
+	nv->surface_fill = nv04_surface_fill;
+	return 0;
+}
+
diff --git a/src/gallium/winsys/drm/nouveau/nv50_surface.c b/src/gallium/winsys/drm/nouveau/nv50_surface.c
new file mode 100644
index 00000000000..c8ab7f690f3
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/nv50_surface.c
@@ -0,0 +1,194 @@
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+
+#include "nouveau_context.h"
+
+static INLINE int
+nv50_format(enum pipe_format format)
+{
+	switch (format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_Z24S8_UNORM:
+		return NV50_2D_DST_FORMAT_32BPP;
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+		return NV50_2D_DST_FORMAT_24BPP;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		return NV50_2D_DST_FORMAT_16BPP;
+	case PIPE_FORMAT_A8_UNORM:
+		return NV50_2D_DST_FORMAT_8BPP;
+	default:
+		return -1;
+	}
+}
+
+static int
+nv50_surface_set(struct nouveau_context *nv, struct pipe_surface *surf, int dst)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	struct nouveau_grobj *eng2d = nv->nvc->Nv2D;
+ 	struct nouveau_bo *bo = nouveau_buffer(surf->buffer)->bo;
+ 	int surf_format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
+ 	int flags = NOUVEAU_BO_VRAM | (dst ? NOUVEAU_BO_WR : NOUVEAU_BO_RD);
+  
+ 	surf_format = nv50_format(surf->format);
+ 	if (surf_format < 0)
+ 		return 1;
+  
+ 	if (!nouveau_bo(bo)->tiled) {
+ 		BEGIN_RING(chan, eng2d, mthd, 2);
+ 		OUT_RING  (chan, surf_format);
+ 		OUT_RING  (chan, 1);
+ 		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
+ 		OUT_RING  (chan, surf->stride);
+ 		OUT_RING  (chan, surf->width);
+ 		OUT_RING  (chan, surf->height);
+ 		OUT_RELOCh(chan, bo, surf->offset, flags);
+ 		OUT_RELOCl(chan, bo, surf->offset, flags);
+ 	} else {
+ 		BEGIN_RING(chan, eng2d, mthd, 5);
+ 		OUT_RING  (chan, surf_format);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, 1);
+ 		OUT_RING  (chan, 0);
+ 		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
+ 		OUT_RING  (chan, surf->width);
+ 		OUT_RING  (chan, surf->height);
+ 		OUT_RELOCh(chan, bo, surf->offset, flags);
+ 		OUT_RELOCl(chan, bo, surf->offset, flags);
+ 	}
+ 
+#if 0
+ 	if (dst) {
+ 		BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, 0);
+ 		OUT_RING  (chan, surf->width);
+ 		OUT_RING  (chan, surf->height);
+ 	}
+#endif
+  
+ 	return 0;
+}
+
+static int
+nv50_surface_copy_prep(struct nouveau_context *nv,
+		       struct pipe_surface *dst, struct pipe_surface *src)
+{
+	int ret;
+
+	assert(src->format == dst->format);
+
+	ret = nv50_surface_set(nv, dst, 1);
+	if (ret)
+		return ret;
+
+	ret = nv50_surface_set(nv, src, 0);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static void
+nv50_surface_copy(struct nouveau_context *nv, unsigned dx, unsigned dy,
+		  unsigned sx, unsigned sy, unsigned w, unsigned h)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	struct nouveau_grobj *eng2d = nv->nvc->Nv2D;
+
+	BEGIN_RING(chan, eng2d, 0x088c, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4);
+	OUT_RING  (chan, dx);
+	OUT_RING  (chan, dy);
+	OUT_RING  (chan, w);
+	OUT_RING  (chan, h);
+	BEGIN_RING(chan, eng2d, 0x08c0, 4);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, eng2d, 0x08d0, 4);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, sx);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, sy);
+}
+
+static void
+nv50_surface_copy_done(struct nouveau_context *nv)
+{
+	FIRE_RING(nv->nvc->channel);
+}
+
+static int
+nv50_surface_fill(struct nouveau_context *nv, struct pipe_surface *dst,
+		  unsigned dx, unsigned dy, unsigned w, unsigned h,
+		  unsigned value)
+{
+	struct nouveau_channel *chan = nv->nvc->channel;
+	struct nouveau_grobj *eng2d = nv->nvc->Nv2D;
+	int rect_format, ret;
+
+	rect_format = nv50_format(dst->format);
+	if (rect_format < 0)
+		return 1;
+
+	ret = nv50_surface_set(nv, dst, 1);
+	if (ret)
+		return ret;
+
+	BEGIN_RING(chan, eng2d, 0x0580, 3);
+	OUT_RING  (chan, 4);
+	OUT_RING  (chan, rect_format);
+	OUT_RING  (chan, value);
+
+	BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4);
+	OUT_RING  (chan, dx);
+	OUT_RING  (chan, dy);
+	OUT_RING  (chan, dx + w);
+	OUT_RING  (chan, dy + h);
+
+	FIRE_RING(chan);
+	return 0;
+}
+
+int
+nouveau_surface_channel_create_nv50(struct nouveau_channel_context *nvc)
+{
+	struct nouveau_channel *chan = nvc->channel;
+	struct nouveau_grobj *eng2d = NULL;
+	int ret;
+
+	ret = nouveau_grobj_alloc(chan, nvc->next_handle++, NV50_2D, &eng2d);
+	if (ret)
+		return ret;
+	nvc->Nv2D = eng2d;
+
+	BIND_RING (chan, eng2d, nvc->next_subchannel++);
+	BEGIN_RING(chan, eng2d, NV50_2D_DMA_NOTIFY, 4);
+	OUT_RING  (chan, nvc->sync_notifier->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+	BEGIN_RING(chan, eng2d, NV50_2D_OPERATION, 1);
+	OUT_RING  (chan, NV50_2D_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, eng2d, 0x0290, 1);
+	OUT_RING  (chan, 0);
+	BEGIN_RING(chan, eng2d, 0x0888, 1);
+	OUT_RING  (chan, 1);
+
+	return 0;
+}
+
+int
+nouveau_surface_init_nv50(struct nouveau_context *nv)
+{
+	nv->surface_copy_prep = nv50_surface_copy_prep;
+	nv->surface_copy = nv50_surface_copy;
+	nv->surface_copy_done = nv50_surface_copy_done;
+	nv->surface_fill = nv50_surface_fill;
+	return 0;
+}
+
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index e9f821d2764..477d766925c 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -537,7 +537,7 @@ xlib_eglDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface)
       }
       else {
          XFreeGC(surf->Dpy, surf->Gc);
-         st_unreference_framebuffer(&surf->Framebuffer);
+         st_unreference_framebuffer(surf->Framebuffer);
          free(surf);
       }
       return EGL_TRUE;
diff --git a/src/gallium/winsys/g3dvl/nouveau/Makefile b/src/gallium/winsys/g3dvl/nouveau/Makefile
new file mode 100644
index 00000000000..ff43327778a
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/Makefile
@@ -0,0 +1,50 @@
+TARGET		= libnouveau_dri.so
+GALLIUMDIR	= ../../..
+DRMDIR		?= /usr
+DRIDIR		= ../../../../driclient
+
+OBJECTS		= nouveau_bo.o nouveau_fence.o nouveau_swapbuffers.o nouveau_channel.o		\
+		  nouveau_grobj.o nouveau_context.o nouveau_winsys.o nouveau_lock.o		\
+		  nouveau_winsys_pipe.o nouveau_device.o nouveau_notifier.o nouveau_dma.o	\
+		  nouveau_pushbuf.o nouveau_resource.o nouveau_screen.o nv04_surface.o		\
+		  nv50_surface.o #nouveau_winsys_softpipe.o
+
+CFLAGS		+= -g -Wall -fPIC			\
+		   -I${GALLIUMDIR}/include		\
+		   -I${GALLIUMDIR}/winsys/g3dvl		\
+		   -I${DRMDIR}/include			\
+		   -I${DRMDIR}/include/drm		\
+		   -I${GALLIUMDIR}/drivers		\
+		   -I${GALLIUMDIR}/auxiliary		\
+		   -I${DRIDIR}/include
+
+LDFLAGS		+= -L${DRMDIR}/lib			\
+		   -L${DRIDIR}/lib			\
+		   -L${GALLIUMDIR}/auxiliary/draw	\
+		   -L${GALLIUMDIR}/auxiliary/tgsi	\
+		   -L${GALLIUMDIR}/auxiliary/translate	\
+		   -L${GALLIUMDIR}/auxiliary/rtasm	\
+		   -L${GALLIUMDIR}/auxiliary/cso_cache	\
+		   -L${GALLIUMDIR}/drivers/nv10		\
+		   -L${GALLIUMDIR}/drivers/nv20		\
+		   -L${GALLIUMDIR}/drivers/nv30		\
+		   -L${GALLIUMDIR}/drivers/nv40		\
+		   -L${GALLIUMDIR}/drivers/nv50
+
+LIBS		+= -ldriclient -ldrm -lnv10 -lnv20 -lnv30 -lnv40 -lnv50 -ldraw -ltgsi -ltranslate -lrtasm -lcso_cache -lm
+
+#############################################
+
+.PHONY	= all clean libdriclient
+
+all: ${TARGET}
+
+${TARGET}: ${OBJECTS} libdriclient
+	$(CC) ${LDFLAGS} -shared -o $@ ${OBJECTS} ${LIBS}
+
+libdriclient:
+	cd ${DRIDIR}/src; ${MAKE}
+
+clean:
+	cd ${DRIDIR}/src; ${MAKE} clean
+	rm -rf ${OBJECTS} ${TARGET}
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c
new file mode 120000
index 00000000000..1005282dd4d
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_bo.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c
new file mode 120000
index 00000000000..5af82029508
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_channel.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c
new file mode 100644
index 00000000000..06a61fcda3e
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c
@@ -0,0 +1,370 @@
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+
+#include "nouveau_context.h"
+#include "nouveau_dri.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_winsys_pipe.h"
+
+/*
+#ifdef DEBUG
+static const struct dri_debug_control debug_control[] = {
+	{ "bo", DEBUG_BO },
+	{ NULL, 0 }
+};
+int __nouveau_debug = 0;
+#endif
+*/
+
+/*
+ * TODO: Re-examine dri_screen, dri_context, nouveau_screen, nouveau_context
+ * relationships, seems like there is a lot of room for simplification there.
+ */
+
+static void
+nouveau_channel_context_destroy(struct nouveau_channel_context *nvc)
+{
+	nouveau_grobj_free(&nvc->NvCtxSurf2D);
+	nouveau_grobj_free(&nvc->NvImageBlit);
+	nouveau_grobj_free(&nvc->NvGdiRect);
+	nouveau_grobj_free(&nvc->NvM2MF);
+	nouveau_grobj_free(&nvc->Nv2D);
+	nouveau_grobj_free(&nvc->NvSwzSurf);
+	nouveau_grobj_free(&nvc->NvSIFM);
+
+	nouveau_notifier_free(&nvc->sync_notifier);
+
+	nouveau_channel_free(&nvc->channel);
+
+	FREE(nvc);
+}
+
+static struct nouveau_channel_context *
+nouveau_channel_context_create(struct nouveau_device *dev)
+{
+	struct nouveau_channel_context *nvc;
+	int ret;
+
+	nvc = CALLOC_STRUCT(nouveau_channel_context);
+	if (!nvc)
+		return NULL;
+
+	if ((ret = nouveau_channel_alloc(dev, 0x8003d001, 0x8003d002,
+					 &nvc->channel))) {
+		NOUVEAU_ERR("Error creating GPU channel: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	nvc->next_handle = 0x80000000;
+
+	if ((ret = nouveau_notifier_alloc(nvc->channel, nvc->next_handle++, 1,
+					  &nvc->sync_notifier))) {
+		NOUVEAU_ERR("Error creating channel sync notifier: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	switch (dev->chipset & 0xf0) {
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		ret = nouveau_surface_channel_create_nv50(nvc);
+		break;
+	default:
+		ret = nouveau_surface_channel_create_nv04(nvc);
+		break;
+	}
+
+	if (ret) {
+		NOUVEAU_ERR("Error initialising surface objects: %d\n", ret);
+		nouveau_channel_context_destroy(nvc);
+		return NULL;
+	}
+
+	return nvc;
+}
+
+int
+nouveau_context_create(dri_context_t *dri_context)
+{
+	dri_screen_t			*dri_screen = dri_context->dri_screen;
+	struct nouveau_screen 		*nv_screen = dri_screen->private;
+	struct nouveau_context		*nv = CALLOC_STRUCT(nouveau_context);
+	struct pipe_context		*pipe = NULL;
+	struct nouveau_channel_context	*nvc = NULL;
+	struct nouveau_device		*dev = nv_screen->device;
+	int				i;
+
+	switch (dev->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		/* NV10 */
+	case 0x30:
+		/* NV30 */
+	case 0x40:
+	case 0x60:
+		/* NV40 */
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		/* G80 */
+		break;
+	default:
+		NOUVEAU_ERR("Unsupported chipset: NV%02x\n", dev->chipset);
+		return 1;
+	}
+
+	dri_context->private = (void*)nv;
+	nv->dri_context = dri_context;
+	nv->nv_screen  = nv_screen;
+
+	{
+		struct nouveau_device_priv *nvdev = nouveau_device(dev);
+
+		nvdev->ctx  = dri_context->drm_context;
+		nvdev->lock = (drmLock*)&dri_screen->sarea->lock;
+	}
+
+	/*
+	driParseConfigFiles(&nv->dri_option_cache, &nv_screen->option_cache,
+			    nv->dri_screen->myNum, "nouveau");
+#ifdef DEBUG
+	__nouveau_debug = driParseDebugString(getenv("NOUVEAU_DEBUG"),
+					      debug_control);
+#endif
+	*/
+
+	/*XXX: Hack up a fake region and buffer object for front buffer.
+	 *     This will go away with TTM, replaced with a simple reference
+	 *     of the front buffer handle passed to us by the DDX.
+	 */
+	{
+		struct pipe_surface *fb_surf;
+		struct nouveau_pipe_buffer *fb_buf;
+		struct nouveau_bo_priv *fb_bo;
+
+		fb_bo = calloc(1, sizeof(struct nouveau_bo_priv));
+		fb_bo->drm.offset = nv_screen->front_offset;
+		fb_bo->drm.flags = NOUVEAU_MEM_FB;
+		fb_bo->drm.size = nv_screen->front_pitch *
+				  nv_screen->front_height;
+		fb_bo->refcount = 1;
+		fb_bo->base.flags = NOUVEAU_BO_PIN | NOUVEAU_BO_VRAM;
+		fb_bo->base.offset = fb_bo->drm.offset;
+		fb_bo->base.handle = (unsigned long)fb_bo;
+		fb_bo->base.size = fb_bo->drm.size;
+		fb_bo->base.device = nv_screen->device;
+
+		fb_buf = calloc(1, sizeof(struct nouveau_pipe_buffer));
+		fb_buf->bo = &fb_bo->base;
+
+		fb_surf = calloc(1, sizeof(struct pipe_surface));
+		if (nv_screen->front_cpp == 2)
+			fb_surf->format = PIPE_FORMAT_R5G6B5_UNORM;
+		else
+			fb_surf->format = PIPE_FORMAT_A8R8G8B8_UNORM;
+		pf_get_block(fb_surf->format, &fb_surf->block);
+		fb_surf->width = nv_screen->front_pitch / nv_screen->front_cpp;
+		fb_surf->height = nv_screen->front_height;
+		fb_surf->stride = fb_surf->width * fb_surf->block.size;
+		fb_surf->refcount = 1;
+		fb_surf->buffer = &fb_buf->base;
+
+		nv->frontbuffer = fb_surf;
+	}
+
+	nvc = nv_screen->nvc;
+
+	if (!nvc) {
+		nvc = nouveau_channel_context_create(dev);
+		if (!nvc) {
+			NOUVEAU_ERR("Failed initialising GPU context\n");
+			return 1;
+		}
+		nv_screen->nvc = nvc;
+	}
+
+	nvc->refcount++;
+	nv->nvc = nvc;
+
+	/* Find a free slot for a pipe context, allocate a new one if needed */
+	nv->pctx_id = -1;
+	for (i = 0; i < nvc->nr_pctx; i++) {
+		if (nvc->pctx[i] == NULL) {
+			nv->pctx_id = i;
+			break;
+		}
+	}
+
+	if (nv->pctx_id < 0) {
+		nv->pctx_id = nvc->nr_pctx++;
+		nvc->pctx =
+			realloc(nvc->pctx,
+				sizeof(struct pipe_context *) * nvc->nr_pctx);
+	}
+
+	/* Create pipe */
+	switch (dev->chipset & 0xf0) {
+	case 0x50:
+	case 0x80:
+	case 0x90:
+		if (nouveau_surface_init_nv50(nv))
+			return 1;
+		break;
+	default:
+		if (nouveau_surface_init_nv04(nv))
+			return 1;
+		break;
+	}
+
+	if (!getenv("NOUVEAU_FORCE_SOFTPIPE")) {
+		struct pipe_screen *pscreen;
+
+		pipe = nouveau_pipe_create(nv);
+		if (!pipe)
+			NOUVEAU_ERR("Couldn't create hw pipe\n");
+		pscreen = nvc->pscreen;
+
+		nv->cap.hw_vertex_buffer =
+			pscreen->get_param(pscreen, NOUVEAU_CAP_HW_VTXBUF);
+		nv->cap.hw_index_buffer =
+			pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF);
+	}
+
+	/* XXX: nouveau_winsys_softpipe needs a mesa header removed before we can compile it. */
+	/*
+	if (!pipe) {
+		NOUVEAU_MSG("Using softpipe\n");
+		pipe = nouveau_create_softpipe(nv);
+		if (!pipe) {
+			NOUVEAU_ERR("Error creating pipe, bailing\n");
+			return 1;
+		}
+	}
+	*/
+	if (!pipe) {
+		NOUVEAU_ERR("Error creating pipe, bailing\n");
+		return 1;
+	}
+
+	pipe->priv = nv;
+
+	return 0;
+}
+
+void
+nouveau_context_destroy(dri_context_t *dri_context)
+{
+	struct nouveau_context *nv = dri_context->private;
+	struct nouveau_channel_context *nvc = nv->nvc;
+
+	assert(nv);
+
+	if (nv->pctx_id >= 0) {
+		nvc->pctx[nv->pctx_id] = NULL;
+		if (--nvc->refcount <= 0) {
+			nouveau_channel_context_destroy(nvc);
+			nv->nv_screen->nvc = NULL;
+		}
+	}
+
+	free(nv);
+}
+
+int
+nouveau_context_bind(struct nouveau_context *nv, dri_drawable_t *dri_drawable)
+{
+	assert(nv);
+	assert(dri_drawable);
+
+	if (nv->dri_drawable != dri_drawable)
+	{
+		nv->dri_drawable = dri_drawable;
+		dri_drawable->private = nv;
+	}
+
+	return 0;
+}
+
+int
+nouveau_context_unbind(struct nouveau_context *nv)
+{
+	assert(nv);
+
+	nv->dri_drawable = NULL;
+
+	return 0;
+}
+
+/* Show starts here */
+
+int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable)
+{
+	struct nouveau_context	*nv;
+	dri_drawable_t		*dri_drawable;
+
+	nv = pipe->priv;
+
+	driCreateDrawable(nv->nv_screen->dri_screen, drawable, &dri_drawable);
+
+	nouveau_context_bind(nv, dri_drawable);
+
+	return 0;
+}
+
+int unbind_pipe_drawable(struct pipe_context *pipe)
+{
+	nouveau_context_unbind(pipe->priv);
+
+	return 0;
+}
+
+struct pipe_context* create_pipe_context(Display *display, int screen)
+{
+	dri_screen_t		*dri_screen;
+	dri_framebuffer_t	dri_framebuf;
+	dri_context_t		*dri_context;
+	struct nouveau_context	*nv;
+
+	driCreateScreen(display, screen, &dri_screen, &dri_framebuf);
+	driCreateContext(dri_screen, XDefaultVisual(display, screen), &dri_context);
+
+	nouveau_screen_create(dri_screen, &dri_framebuf);
+	nouveau_context_create(dri_context);
+
+	nv = dri_context->private;
+
+	return nv->nvc->pctx[nv->pctx_id];
+}
+
+int destroy_pipe_context(struct pipe_context *pipe)
+{
+	struct pipe_screen	*screen;
+	struct pipe_winsys	*winsys;
+	struct nouveau_context	*nv;
+	dri_screen_t		*dri_screen;
+	dri_context_t		*dri_context;
+
+	assert(pipe);
+
+	screen = pipe->screen;
+	winsys = pipe->winsys;
+	nv = pipe->priv;
+	dri_context = nv->dri_context;
+	dri_screen = dri_context->dri_screen;
+
+	pipe->destroy(pipe);
+	screen->destroy(screen);
+	free(winsys);
+
+	nouveau_context_destroy(dri_context);
+	nouveau_screen_destroy(dri_screen);
+	driDestroyContext(dri_context);
+	driDestroyScreen(dri_screen);
+
+	return 0;
+}
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h
new file mode 100644
index 00000000000..395a3ab7903
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h
@@ -0,0 +1,105 @@
+#ifndef __NOUVEAU_CONTEXT_H__
+#define __NOUVEAU_CONTEXT_H__
+
+/*#include "xmlconfig.h"*/
+
+#include <driclient.h>
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau_drmif.h"
+#include "nouveau_dma.h"
+
+struct nouveau_channel_context {
+	struct pipe_screen *pscreen;
+	int refcount;
+
+	unsigned cur_pctx;
+	unsigned nr_pctx;
+	struct pipe_context **pctx;
+
+	struct nouveau_channel  *channel;
+
+	struct nouveau_notifier *sync_notifier;
+
+	/* Common */
+	struct nouveau_grobj    *NvM2MF;
+	/* NV04-NV40 */
+	struct nouveau_grobj    *NvCtxSurf2D;
+	struct nouveau_grobj	*NvSwzSurf;
+	struct nouveau_grobj    *NvImageBlit;
+	struct nouveau_grobj    *NvGdiRect;
+	struct nouveau_grobj	*NvSIFM;
+	/* G80 */
+	struct nouveau_grobj    *Nv2D;
+
+	uint32_t                 next_handle;
+	uint32_t                 next_subchannel;
+	uint32_t                 next_sequence;
+};
+
+struct nouveau_context {
+	/* DRI stuff */
+	dri_context_t		*dri_context;
+	dri_drawable_t		*dri_drawable;
+	unsigned int		last_stamp;
+	/*driOptionCache	dri_option_cache;*/
+	drm_context_t		drm_context;
+	drmLock			drm_lock;
+	int			locked;
+	struct nouveau_screen	*nv_screen;
+	struct pipe_surface	*frontbuffer;
+
+	struct {
+		int hw_vertex_buffer;
+		int hw_index_buffer;
+	} cap;
+
+	/* Hardware context */
+	struct nouveau_channel_context	*nvc;
+	int				pctx_id;
+
+	/* pipe_surface accel */
+	struct pipe_surface		*surf_src, *surf_dst;
+	unsigned			surf_src_offset, surf_dst_offset;
+	
+	int  (*surface_copy_prep)(struct nouveau_context *,
+				  struct pipe_surface *dst,
+				  struct pipe_surface *src);
+	void (*surface_copy)(struct nouveau_context *, unsigned dx, unsigned dy,
+			     unsigned sx, unsigned sy, unsigned w, unsigned h);
+	void (*surface_copy_done)(struct nouveau_context *);
+	int (*surface_fill)(struct nouveau_context *, struct pipe_surface *,
+			    unsigned, unsigned, unsigned, unsigned, unsigned);
+};
+
+extern int nouveau_context_create(dri_context_t *);
+extern void nouveau_context_destroy(dri_context_t *);
+extern int nouveau_context_bind(struct nouveau_context *, dri_drawable_t *);
+extern int nouveau_context_unbind(struct nouveau_context *);
+
+#ifdef DEBUG
+extern int __nouveau_debug;
+
+#define DEBUG_BO (1 << 0)
+
+#define DBG(flag, ...) do {                   \
+	if (__nouveau_debug & (DEBUG_##flag)) \
+		NOUVEAU_ERR(__VA_ARGS__);     \
+} while(0)
+#else
+#define DBG(flag, ...)
+#endif
+
+extern void LOCK_HARDWARE(struct nouveau_context *);
+extern void UNLOCK_HARDWARE(struct nouveau_context *);
+
+extern int
+nouveau_surface_channel_create_nv04(struct nouveau_channel_context *);
+extern int
+nouveau_surface_channel_create_nv50(struct nouveau_channel_context *);
+extern int nouveau_surface_init_nv04(struct nouveau_context *);
+extern int nouveau_surface_init_nv50(struct nouveau_context *);
+
+extern uint32_t *nouveau_pipe_dma_beginp(struct nouveau_grobj *, int, int);
+extern void nouveau_pipe_dma_kickoff(struct nouveau_channel *);
+
+#endif
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c
new file mode 120000
index 00000000000..63f1fa040c7
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_device.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c
new file mode 120000
index 00000000000..cd0d32e5373
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_dma.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h
new file mode 120000
index 00000000000..e6c7d4bc965
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_dma.h
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h
new file mode 120000
index 00000000000..c8f9dbdc3a8
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_dri.h
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h
new file mode 120000
index 00000000000..27082c9d34c
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_drmif.h
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c
new file mode 120000
index 00000000000..51a50527e6d
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_fence.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c
new file mode 120000
index 00000000000..db17c72e6d7
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_grobj.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h
new file mode 120000
index 00000000000..4e9d3042cb5
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_local.h
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c
new file mode 100644
index 00000000000..375634bd059
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c
@@ -0,0 +1,92 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <pthread.h>
+#include <driclient.h>
+#include "nouveau_context.h"
+#include "nouveau_screen.h"
+
+static pthread_mutex_t lockMutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+nouveau_contended_lock(struct nouveau_context *nv, unsigned int flags)
+{
+	dri_drawable_t			*dri_drawable = nv->dri_drawable;
+	dri_screen_t			*dri_screen = nv->dri_context->dri_screen;
+	struct nouveau_screen		*nv_screen = nv->nv_screen;
+	struct nouveau_device		*dev = nv_screen->device;
+	struct nouveau_device_priv	*nvdev = nouveau_device(dev);
+
+	drmGetLock(nvdev->fd, nvdev->ctx, flags);
+
+	/* If the window moved, may need to set a new cliprect now.
+	 *
+	 * NOTE: This releases and regains the hw lock, so all state
+	 * checking must be done *after* this call:
+	 */
+	if (dri_drawable)
+		DRI_VALIDATE_DRAWABLE_INFO(dri_screen, dri_drawable);
+}
+
+/* Lock the hardware and validate our state.
+ */
+void
+LOCK_HARDWARE(struct nouveau_context *nv)
+{
+	struct nouveau_screen		*nv_screen = nv->nv_screen;
+	struct nouveau_device		*dev = nv_screen->device;
+	struct nouveau_device_priv	*nvdev = nouveau_device(dev);
+	char				__ret=0;
+
+	pthread_mutex_lock(&lockMutex);
+	assert(!nv->locked);
+	
+	DRM_CAS(nvdev->lock, nvdev->ctx,
+		(DRM_LOCK_HELD | nvdev->ctx), __ret);
+	
+	if (__ret)
+		nouveau_contended_lock(nv, 0);
+	nv->locked = 1;
+}
+
+
+/* Unlock the hardware using the global current context 
+ */
+void
+UNLOCK_HARDWARE(struct nouveau_context *nv)
+{
+	struct nouveau_screen		*nv_screen = nv->nv_screen;
+	struct nouveau_device		*dev = nv_screen->device;
+	struct nouveau_device_priv	*nvdev = nouveau_device(dev);
+
+	assert(nv->locked);
+	nv->locked = 0;
+
+	DRM_UNLOCK(nvdev->fd, nvdev->lock, nvdev->ctx);
+
+	pthread_mutex_unlock(&lockMutex);
+} 
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c
new file mode 120000
index 00000000000..703bc3ce4ab
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_notifier.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c
new file mode 120000
index 00000000000..4ac137cadad
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_pushbuf.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c
new file mode 120000
index 00000000000..2241af328ff
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_resource.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c
new file mode 100644
index 00000000000..f80d00050c8
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c
@@ -0,0 +1,91 @@
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "nouveau_context.h"
+#include <nouveau_drm.h>
+#include "nouveau_dri.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+
+#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 11
+#error nouveau_drm.h version does not match expected version
+#endif
+
+/*
+PUBLIC const char __driConfigOptions[] =
+DRI_CONF_BEGIN
+DRI_CONF_END;
+static const GLuint __driNConfigOptions = 0;
+*/
+
+int nouveau_check_dri_drm_ddx(dri_version_t *dri, dri_version_t *drm, dri_version_t *ddx)
+{
+	static const dri_version_t ddx_expected = {0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL};
+	static const dri_version_t dri_expected = {4, 0, 0};
+	static const dri_version_t drm_expected = {0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL};
+
+	assert(dri);
+	assert(drm);
+	assert(ddx);
+
+	if (dri->major != dri_expected.major || dri->minor < dri_expected.minor)
+	{
+		NOUVEAU_ERR("Unexpected DRI version.\n");
+		return 1;
+	}
+	if (drm->major != drm_expected.major || drm->minor < drm_expected.minor)
+	{
+		NOUVEAU_ERR("Unexpected DRM version.\n");
+		return 1;
+	}
+	if (ddx->major != ddx_expected.major || ddx->minor < ddx_expected.minor)
+	{
+		NOUVEAU_ERR("Unexpected DDX version.\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+int
+nouveau_screen_create(dri_screen_t *dri_screen, dri_framebuffer_t *dri_framebuf)
+{
+	struct nouveau_dri	*nv_dri = dri_framebuf->private;
+	struct nouveau_screen	*nv_screen;
+	int			ret;
+
+	if (nouveau_check_dri_drm_ddx(&dri_screen->dri, &dri_screen->drm, &dri_screen->ddx))
+		return 1;
+
+	nv_screen = CALLOC_STRUCT(nouveau_screen);
+	if (!nv_screen)
+		return 1;
+	nv_screen->dri_screen = dri_screen;
+	dri_screen->private = (void*)nv_screen;
+
+	/*
+	driParseOptionInfo(&nv_screen->option_cache,
+			   __driConfigOptions, __driNConfigOptions);
+	*/
+
+	if ((ret = nouveau_device_open_existing(&nv_screen->device, 0,
+						dri_screen->fd, 0))) {
+		NOUVEAU_ERR("Failed opening nouveau device: %d.\n", ret);
+		return 1;
+	}
+
+	nv_screen->front_offset = nv_dri->front_offset;
+	nv_screen->front_pitch  = nv_dri->front_pitch * (nv_dri->bpp / 8);
+	nv_screen->front_cpp = nv_dri->bpp / 8;
+	nv_screen->front_height = nv_dri->height;
+
+	return 0;
+}
+
+void
+nouveau_screen_destroy(dri_screen_t *dri_screen)
+{
+	struct nouveau_screen *nv_screen = dri_screen->private;
+
+	FREE(nv_screen);
+}
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h
new file mode 100644
index 00000000000..8a58bb75564
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h
@@ -0,0 +1,24 @@
+#ifndef __NOUVEAU_SCREEN_H__
+#define __NOUVEAU_SCREEN_H__
+
+/* TODO: Investigate using DRI options for interesting things */
+/*#include "xmlconfig.h"*/
+
+struct nouveau_screen {
+	dri_screen_t			*dri_screen;
+	struct nouveau_device		*device;
+	struct nouveau_channel_context	*nvc;
+
+	uint32_t			front_offset;
+	uint32_t			front_pitch;
+	uint32_t			front_cpp;
+	uint32_t			front_height;
+	
+	/*driOptionCache		option_cache;*/
+};
+
+int nouveau_screen_create(dri_screen_t *dri_screen, dri_framebuffer_t *dri_framebuf);
+void nouveau_screen_destroy(dri_screen_t *dri_screen);
+
+#endif
+
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c
new file mode 100644
index 00000000000..7916c806151
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c
@@ -0,0 +1,64 @@
+#include "pipe/p_context.h"
+#include "nouveau_context.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+
+void
+nouveau_copy_buffer(dri_drawable_t *dri_drawable, struct pipe_surface *surf,
+		    const drm_clip_rect_t *rect)
+{
+	struct nouveau_context	*nv = dri_drawable->private;
+	drm_clip_rect_t		*pbox;
+	int			nbox, i;
+
+	LOCK_HARDWARE(nv);
+	if (!dri_drawable->num_cliprects) {
+		UNLOCK_HARDWARE(nv);
+		return;
+	}
+	pbox = dri_drawable->cliprects;
+	nbox = dri_drawable->num_cliprects;
+
+	nv->surface_copy_prep(nv, nv->frontbuffer, surf);
+	for (i = 0; i < nbox; i++, pbox++) {
+		int sx, sy, dx, dy, w, h;
+
+		sx = pbox->x1 - dri_drawable->x;
+		sy = pbox->y1 - dri_drawable->y;
+		dx = pbox->x1;
+		dy = pbox->y1;
+		w  = pbox->x2 - pbox->x1;
+		h  = pbox->y2 - pbox->y1;
+
+		nv->surface_copy(nv, dx, dy, sx, sy, w, h);
+	}
+
+	FIRE_RING(nv->nvc->channel);
+	UNLOCK_HARDWARE(nv);
+
+	//if (nv->last_stamp != dri_drawable->last_sarea_stamp)
+		//nv->last_stamp = dri_drawable->last_sarea_stamp;
+}
+
+void
+nouveau_copy_sub_buffer(dri_drawable_t *dri_drawable, struct pipe_surface *surf, int x, int y, int w, int h)
+{
+	if (surf) {
+		drm_clip_rect_t rect;
+		rect.x1 = x;
+		rect.y1 = y;
+		rect.x2 = x + w;
+		rect.y2 = y + h;
+
+		nouveau_copy_buffer(dri_drawable, surf, &rect);
+	}
+}
+
+void
+nouveau_swap_buffers(dri_drawable_t *dri_drawable, struct pipe_surface *surf)
+{
+	if (surf)
+		nouveau_copy_buffer(dri_drawable, surf, NULL);
+}
+
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h
new file mode 100644
index 00000000000..35e934adba8
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h
@@ -0,0 +1,10 @@
+#ifndef __NOUVEAU_SWAPBUFFERS_H__
+#define __NOUVEAU_SWAPBUFFERS_H__
+
+extern void nouveau_copy_buffer(dri_drawable_t *, struct pipe_surface *,
+				const drm_clip_rect_t *);
+extern void nouveau_copy_sub_buffer(dri_drawable_t *, struct pipe_surface *,
+				    int x, int y, int w, int h);
+extern void nouveau_swap_buffers(dri_drawable_t *, struct pipe_surface *);
+
+#endif
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c
new file mode 120000
index 00000000000..ce4052d5572
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_winsys.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c
new file mode 100644
index 00000000000..4f6ac9cad0b
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c
@@ -0,0 +1,260 @@
+#include "pipe/p_winsys.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+
+#include "nouveau_context.h"
+#include "nouveau_local.h"
+#include "nouveau_screen.h"
+#include "nouveau_swapbuffers.h"
+#include "nouveau_winsys_pipe.h"
+
+static void
+nouveau_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surf,
+			  void *context_private)
+{
+	struct nouveau_context *nv = context_private;
+	dri_drawable_t *dri_drawable = nv->dri_drawable;
+
+	nouveau_copy_buffer(dri_drawable, surf, NULL);
+}
+
+static const char *
+nouveau_get_name(struct pipe_winsys *pws)
+{
+	return "Nouveau/DRI";
+}
+
+static struct pipe_surface *
+nouveau_surface_alloc(struct pipe_winsys *ws)
+{
+	struct pipe_surface *surf;
+
+	surf = CALLOC_STRUCT(pipe_surface);
+	if (!surf)
+		return NULL;
+
+	surf->refcount = 1;
+	surf->winsys = ws;
+	return surf;
+}
+
+/* Borrowed from Mesa's xm_winsys */
+static unsigned int
+round_up(unsigned n, unsigned multiple)
+{
+   return (n + multiple - 1) & ~(multiple - 1);
+}
+
+static int
+nouveau_surface_alloc_storage
+(
+	struct pipe_winsys *pws,
+	struct pipe_surface *surface,
+	unsigned width,
+	unsigned height,
+	enum pipe_format format,
+	unsigned flags,
+	unsigned tex_usage
+)
+{
+	const unsigned int ALIGNMENT = 256;
+
+	assert(pws);
+	assert(surface);
+
+	surface->width = width;
+	surface->height = height;
+	surface->format = format;
+	pf_get_block(format, &surface->block);
+	surface->nblocksx = pf_get_nblocksx(&surface->block, width);
+	surface->nblocksy = pf_get_nblocksy(&surface->block, height);
+	surface->stride = round_up(surface->nblocksx * surface->block.size, ALIGNMENT);
+	surface->usage = flags;
+	surface->buffer = pws->buffer_create(pws, ALIGNMENT, PIPE_BUFFER_USAGE_PIXEL, surface->stride * surface->nblocksy);
+
+	return 0;
+}
+
+static void
+nouveau_surface_release(struct pipe_winsys *ws, struct pipe_surface **s)
+{
+	struct pipe_surface *surf = *s;
+
+	*s = NULL;
+	if (--surf->refcount <= 0) {
+		if (surf->buffer)
+			winsys_buffer_reference(ws, &surf->buffer, NULL);
+		free(surf);
+	}
+}
+
+static struct pipe_buffer *
+nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment,
+		       unsigned usage, unsigned size)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws;
+	struct nouveau_context *nv = nvpws->nv;
+	struct nouveau_device *dev = nv->nv_screen->device;
+	struct nouveau_pipe_buffer *nvbuf;
+	uint32_t flags;
+
+	nvbuf = calloc(1, sizeof(*nvbuf));
+	if (!nvbuf)
+		return NULL;
+	nvbuf->base.refcount = 1;
+	nvbuf->base.alignment = alignment;
+	nvbuf->base.usage = usage;
+	nvbuf->base.size = size;
+
+	flags = NOUVEAU_BO_LOCAL;
+
+	if (usage & PIPE_BUFFER_USAGE_PIXEL) {
+		if (usage & NOUVEAU_BUFFER_USAGE_TEXTURE)
+			flags |= NOUVEAU_BO_GART;
+		flags |= NOUVEAU_BO_VRAM;
+	}
+
+	if (usage & PIPE_BUFFER_USAGE_VERTEX) {
+		if (nv->cap.hw_vertex_buffer)
+			flags |= NOUVEAU_BO_GART;
+	}
+
+	if (usage & PIPE_BUFFER_USAGE_INDEX) {
+		if (nv->cap.hw_index_buffer)
+			flags |= NOUVEAU_BO_GART;
+	}
+
+	if (nouveau_bo_new(dev, flags, alignment, size, &nvbuf->bo)) {
+		free(nvbuf);
+		return NULL;
+	}
+
+	return &nvbuf->base;
+}
+
+static struct pipe_buffer *
+nouveau_pipe_bo_user_create(struct pipe_winsys *pws, void *ptr, unsigned bytes)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws;
+	struct nouveau_device *dev = nvpws->nv->nv_screen->device;
+	struct nouveau_pipe_buffer *nvbuf;
+
+	nvbuf = calloc(1, sizeof(*nvbuf));
+	if (!nvbuf)
+		return NULL;
+	nvbuf->base.refcount = 1;
+	nvbuf->base.size = bytes;
+
+	if (nouveau_bo_user(dev, ptr, bytes, &nvbuf->bo)) {
+		free(nvbuf);
+		return NULL;
+	}
+
+	return &nvbuf->base;
+}
+
+static void
+nouveau_pipe_bo_del(struct pipe_winsys *ws, struct pipe_buffer *buf)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+
+	nouveau_bo_del(&nvbuf->bo);
+	free(nvbuf);
+}
+
+static void *
+nouveau_pipe_bo_map(struct pipe_winsys *pws, struct pipe_buffer *buf,
+		    unsigned flags)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+	uint32_t map_flags = 0;
+
+	if (flags & PIPE_BUFFER_USAGE_CPU_READ)
+		map_flags |= NOUVEAU_BO_RD;
+	if (flags & PIPE_BUFFER_USAGE_CPU_WRITE)
+		map_flags |= NOUVEAU_BO_WR;
+
+	if (nouveau_bo_map(nvbuf->bo, map_flags))
+		return NULL;
+	return nvbuf->bo->map;
+}
+
+static void
+nouveau_pipe_bo_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf)
+{
+	struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf);
+
+	nouveau_bo_unmap(nvbuf->bo);
+}
+
+static INLINE struct nouveau_fence *
+nouveau_pipe_fence(struct pipe_fence_handle *pfence)
+{
+	return (struct nouveau_fence *)pfence;
+}
+
+static void
+nouveau_pipe_fence_reference(struct pipe_winsys *ws,
+			     struct pipe_fence_handle **ptr,
+			     struct pipe_fence_handle *pfence)
+{
+	nouveau_fence_ref((void *)pfence, (void *)ptr);
+}
+
+static int
+nouveau_pipe_fence_signalled(struct pipe_winsys *ws,
+			     struct pipe_fence_handle *pfence, unsigned flag)
+{
+	struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)ws;
+	struct nouveau_fence *fence = nouveau_pipe_fence(pfence);
+
+	if (nouveau_fence(fence)->signalled == 0)
+		nouveau_fence_flush(nvpws->nv->nvc->channel);
+
+	return !nouveau_fence(fence)->signalled;
+}
+
+static int
+nouveau_pipe_fence_finish(struct pipe_winsys *ws,
+			  struct pipe_fence_handle *pfence, unsigned flag)
+{
+	struct nouveau_fence *fence = nouveau_pipe_fence(pfence);
+	struct nouveau_fence *ref = NULL;
+
+	nouveau_fence_ref(fence, &ref);
+	return nouveau_fence_wait(&ref);
+}
+
+struct pipe_winsys *
+nouveau_create_pipe_winsys(struct nouveau_context *nv)
+{
+	struct nouveau_pipe_winsys *nvpws;
+	struct pipe_winsys *pws;
+
+	nvpws = CALLOC_STRUCT(nouveau_pipe_winsys);
+	if (!nvpws)
+		return NULL;
+	nvpws->nv = nv;
+	pws = &nvpws->pws;
+
+	pws->flush_frontbuffer = nouveau_flush_frontbuffer;
+
+	pws->surface_alloc = nouveau_surface_alloc;
+	pws->surface_alloc_storage = nouveau_surface_alloc_storage;
+	pws->surface_release = nouveau_surface_release;
+
+	pws->buffer_create = nouveau_pipe_bo_create;
+	pws->buffer_destroy = nouveau_pipe_bo_del;
+	pws->user_buffer_create = nouveau_pipe_bo_user_create;
+	pws->buffer_map = nouveau_pipe_bo_map;
+	pws->buffer_unmap = nouveau_pipe_bo_unmap;
+
+	pws->fence_reference = nouveau_pipe_fence_reference;
+	pws->fence_signalled = nouveau_pipe_fence_signalled;
+	pws->fence_finish = nouveau_pipe_fence_finish;
+
+	pws->get_name = nouveau_get_name;
+
+	return &nvpws->pws;
+}
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h
new file mode 120000
index 00000000000..9d9460883e0
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_winsys_pipe.h
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c
new file mode 120000
index 00000000000..ec613ecbf65
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c
@@ -0,0 +1 @@
+../../drm/nouveau/nouveau_winsys_softpipe.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c b/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c
new file mode 120000
index 00000000000..4455d8f9243
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c
@@ -0,0 +1 @@
+../../drm/nouveau/nv04_surface.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c b/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c
new file mode 120000
index 00000000000..19f102001e9
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c
@@ -0,0 +1 @@
+../../drm/nouveau/nv50_surface.c
+\ No newline at end of file
diff --git a/src/gallium/winsys/g3dvl/vl_winsys.h b/src/gallium/winsys/g3dvl/vl_winsys.h
new file mode 100644
index 00000000000..c83db28dd98
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/vl_winsys.h
@@ -0,0 +1,14 @@
+#ifndef vl_winsys_h
+#define vl_winsys_h
+
+#include <X11/Xlib.h>
+
+struct pipe_context;
+
+struct pipe_context* create_pipe_context(Display *display, int screen);
+int destroy_pipe_context(struct pipe_context *pipe);
+int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable);
+int unbind_pipe_drawable(struct pipe_context *pipe);
+
+#endif
+
diff --git a/src/gallium/winsys/g3dvl/xsp_winsys.c b/src/gallium/winsys/g3dvl/xsp_winsys.c
new file mode 100644
index 00000000000..68be2c2ea3a
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/xsp_winsys.c
@@ -0,0 +1,336 @@
+#include "vl_winsys.h"
+#include <X11/Xutil.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include <softpipe/sp_winsys.h>
+
+/* pipe_winsys implementation */
+
+struct xsp_pipe_winsys
+{
+	struct pipe_winsys	base;
+	XImage			fbimage;
+};
+
+struct xsp_context
+{
+	Display			*display;
+	int			screen;
+	Drawable		drawable;
+	int			drawable_bound;
+};
+
+struct xsp_buffer
+{
+	struct pipe_buffer	base;
+	boolean			is_user_buffer;
+	void			*data;
+	void			*mapped_data;
+};
+
+static struct pipe_buffer* xsp_buffer_create(struct pipe_winsys *pws, unsigned alignment, unsigned usage, unsigned size)
+{
+	struct xsp_buffer *buffer;
+
+	assert(pws);
+
+	buffer = calloc(1, sizeof(struct xsp_buffer));
+	buffer->base.refcount = 1;
+	buffer->base.alignment = alignment;
+	buffer->base.usage = usage;
+	buffer->base.size = size;
+	buffer->data = align_malloc(size, alignment);
+
+	return (struct pipe_buffer*)buffer;
+}
+
+static struct pipe_buffer* xsp_user_buffer_create(struct pipe_winsys *pws, void *data, unsigned size)
+{
+	struct xsp_buffer *buffer;
+
+	assert(pws);
+
+	buffer = calloc(1, sizeof(struct xsp_buffer));
+	buffer->base.refcount = 1;
+	buffer->base.size = size;
+	buffer->is_user_buffer = TRUE;
+	buffer->data = data;
+
+	return (struct pipe_buffer*)buffer;
+}
+
+static void* xsp_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buffer, unsigned flags)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(pws);
+	assert(buffer);
+
+	xsp_buf->mapped_data = xsp_buf->data;
+
+	return xsp_buf->mapped_data;
+}
+
+static void xsp_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buffer)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(pws);
+	assert(buffer);
+
+	xsp_buf->mapped_data = NULL;
+}
+
+static void xsp_buffer_destroy(struct pipe_winsys *pws, struct pipe_buffer *buffer)
+{
+	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
+
+	assert(pws);
+	assert(buffer);
+
+	if (!xsp_buf->is_user_buffer)
+		align_free(xsp_buf->data);
+
+	free(xsp_buf);
+}
+
+static struct pipe_surface* xsp_surface_alloc(struct pipe_winsys *pws)
+{
+	struct pipe_surface *surface;
+
+	assert(pws);
+
+	surface = calloc(1, sizeof(struct pipe_surface));
+	surface->refcount = 1;
+	surface->winsys = pws;
+
+	return surface;
+}
+
+/* Borrowed from Mesa's xm_winsys */
+static unsigned int round_up(unsigned n, unsigned multiple)
+{
+   return (n + multiple - 1) & ~(multiple - 1);
+}
+
+static int xsp_surface_alloc_storage
+(
+	struct pipe_winsys *pws,
+	struct pipe_surface *surface,
+	unsigned width,
+	unsigned height,
+	enum pipe_format format,
+	unsigned flags,
+	unsigned tex_usage
+)
+{
+	const unsigned int ALIGNMENT = 1;
+
+	assert(pws);
+	assert(surface);
+
+	surface->width = width;
+	surface->height = height;
+	surface->format = format;
+	pf_get_block(format, &surface->block);
+	surface->nblocksx = pf_get_nblocksx(&surface->block, width);
+	surface->nblocksy = pf_get_nblocksy(&surface->block, height);
+	surface->stride = round_up(surface->nblocksx * surface->block.size, ALIGNMENT);
+	surface->usage = flags;
+	surface->buffer = pws->buffer_create(pws, ALIGNMENT, PIPE_BUFFER_USAGE_PIXEL, surface->stride * surface->nblocksy);
+
+	return 0;
+}
+
+static void xsp_surface_release(struct pipe_winsys *pws, struct pipe_surface **surface)
+{
+	struct pipe_surface *s;
+
+	assert(pws);
+	assert(surface);
+	assert(*surface);
+
+	s = *surface;
+
+	s->refcount--;
+
+	if (s->refcount == 0)
+	{
+		winsys_buffer_reference(pws, &s->buffer, NULL);
+		free(s);
+	}
+
+	*surface = NULL;
+}
+
+static void xsp_fence_reference(struct pipe_winsys *pws, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence)
+{
+	assert(pws);
+	assert(ptr);
+	assert(fence);
+}
+
+static int xsp_fence_signalled(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+	assert(pws);
+	assert(fence);
+
+	return 0;
+}
+
+static int xsp_fence_finish(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+	assert(pws);
+	assert(fence);
+
+	return 0;
+}
+
+static void xsp_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surface, void *context_private)
+{
+	struct xsp_pipe_winsys	*xsp_winsys;
+	struct xsp_context	*xsp_context;
+
+	assert(pws);
+	assert(surface);
+	assert(context_private);
+
+	xsp_winsys = (struct xsp_pipe_winsys*)pws;
+	xsp_context = (struct xsp_context*)context_private;
+
+	if (!xsp_context->drawable_bound)
+		return;
+
+	xsp_winsys->fbimage.width = surface->width;
+	xsp_winsys->fbimage.height = surface->height;
+	xsp_winsys->fbimage.bytes_per_line = surface->width * (xsp_winsys->fbimage.bits_per_pixel >> 3);
+	xsp_winsys->fbimage.data = pipe_surface_map(surface, 0);
+
+	XPutImage
+	(
+		xsp_context->display,
+		xsp_context->drawable,
+		XDefaultGC(xsp_context->display, xsp_context->screen),
+		&xsp_winsys->fbimage,
+		0,
+		0,
+		0,
+		0,
+		surface->width,
+		surface->height
+	);
+	XFlush(xsp_context->display);
+	pipe_surface_unmap(surface);
+}
+
+static const char* xsp_get_name(struct pipe_winsys *pws)
+{
+	assert(pws);
+	return "X11 SoftPipe";
+}
+
+/* Show starts here */
+
+int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable)
+{
+	struct xsp_context *xsp_context;
+
+	assert(pipe);
+
+	xsp_context = pipe->priv;
+	xsp_context->drawable = drawable;
+	xsp_context->drawable_bound = 1;
+
+	return 0;
+}
+
+int unbind_pipe_drawable(struct pipe_context *pipe)
+{
+	struct xsp_context *xsp_context;
+
+	assert(pipe);
+
+	xsp_context = pipe->priv;
+	xsp_context->drawable_bound = 0;
+
+	return 0;
+}
+
+struct pipe_context* create_pipe_context(Display *display, int screen)
+{
+	struct xsp_pipe_winsys	*xsp_winsys;
+	struct xsp_context	*xsp_context;
+	struct pipe_screen	*sp_screen;
+	struct pipe_context	*sp_pipe;
+
+	assert(display);
+
+	xsp_winsys = calloc(1, sizeof(struct xsp_pipe_winsys));
+	xsp_winsys->base.buffer_create = xsp_buffer_create;
+	xsp_winsys->base.user_buffer_create = xsp_user_buffer_create;
+	xsp_winsys->base.buffer_map = xsp_buffer_map;
+	xsp_winsys->base.buffer_unmap = xsp_buffer_unmap;
+	xsp_winsys->base.buffer_destroy = xsp_buffer_destroy;
+	xsp_winsys->base.surface_alloc = xsp_surface_alloc;
+	xsp_winsys->base.surface_alloc_storage = xsp_surface_alloc_storage;
+	xsp_winsys->base.surface_release = xsp_surface_release;
+	xsp_winsys->base.fence_reference = xsp_fence_reference;
+	xsp_winsys->base.fence_signalled = xsp_fence_signalled;
+	xsp_winsys->base.fence_finish = xsp_fence_finish;
+	xsp_winsys->base.flush_frontbuffer = xsp_flush_frontbuffer;
+	xsp_winsys->base.get_name = xsp_get_name;
+
+	{
+		/* XXX: Can't use the returned XImage* directly,
+		since we don't have control over winsys destruction
+		and we wouldn't be able to free it */
+		XImage *template = XCreateImage
+		(
+			display,
+			XDefaultVisual(display, XDefaultScreen(display)),
+			XDefaultDepth(display, XDefaultScreen(display)),
+			ZPixmap,
+			0,
+			NULL,
+			0,	/* Don't know the width and height until flush_frontbuffer */
+			0,
+			32,
+			0
+		);
+
+		memcpy(&xsp_winsys->fbimage, template, sizeof(XImage));
+		XInitImage(&xsp_winsys->fbimage);
+
+		XDestroyImage(template);
+	}
+
+	sp_screen = softpipe_create_screen((struct pipe_winsys*)xsp_winsys);
+	sp_pipe = softpipe_create(sp_screen, (struct pipe_winsys*)xsp_winsys, NULL);
+
+	xsp_context = calloc(1, sizeof(struct xsp_context));
+	xsp_context->display = display;
+	xsp_context->screen = screen;
+
+	sp_pipe->priv = xsp_context;
+
+	return sp_pipe;
+}
+
+int destroy_pipe_context(struct pipe_context *pipe)
+{
+	struct pipe_screen *screen;
+	struct pipe_winsys *winsys;
+
+	assert(pipe);
+
+	screen = pipe->screen;
+	winsys = pipe->winsys;
+	free(pipe->priv);
+	pipe->destroy(pipe);
+	screen->destroy(screen);
+	free(winsys);
+
+	return 0;
+}
diff --git a/src/gallium/winsys/gdi/SConscript b/src/gallium/winsys/gdi/SConscript
index dce81ec1ca2..cc6aa6634fd 100644
--- a/src/gallium/winsys/gdi/SConscript
+++ b/src/gallium/winsys/gdi/SConscript
@@ -8,9 +8,10 @@ if env['platform'] == 'windows':
 	env = env.Clone()
 
 	env.Append(CPPPATH = [
-		'#src/mesa/glapi',
-		'#src/mesa',
-		'#src/mesa/main',
+		'#src/mesa/state_tracker/wgl',
+	])
+
+	env.Append(CPPDEFINES = [
 	])
 
 	env.Append(CPPDEFINES = [
@@ -20,20 +21,21 @@ if env['platform'] == 'windows':
 	])
 
 	sources = [
-		'opengl32.def',
-		'wgl.c',
-		'wmesa.c',
+		'#src/mesa/state_tracker/wgl/opengl32.def',
+		'gdi_softpipe_winsys.c',
 	]
 		
 	drivers = [
 		softpipe,
 	]
 
-	env.Append(LIBS = ['gdi32', 'user32'])
+	env.Append(LIBS = [
+		'gdi32', 
+		'user32'
+	])
 
-	# TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
 	env.SharedLibrary(
 		target ='opengl32',
 		source = sources,
-		LIBS = glapi + mesa + drivers + auxiliaries + env['LIBS'],
+		LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'],
 	)
diff --git a/src/gallium/winsys/gdi/colors.h b/src/gallium/winsys/gdi/colors.h
deleted file mode 100644
index 03e512c1fa9..00000000000
--- a/src/gallium/winsys/gdi/colors.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Values for wmesa->pixelformat: */
-
-#define PF_8A8B8G8R	3	/* 32-bit TrueColor:  8-A, 8-B, 8-G, 8-R */
-#define PF_8R8G8B	4	/* 32-bit TrueColor:  8-R, 8-G, 8-B */
-#define PF_5R6G5B	5	/* 16-bit TrueColor:  5-R, 6-G, 5-B bits */
-#define PF_DITHER8	6	/* Dithered RGB using a lookup table */
-#define PF_LOOKUP	7	/* Undithered RGB using a lookup table */
-#define PF_GRAYSCALE	10	/* Grayscale or StaticGray */
-#define PF_BADFORMAT	11
-#define PF_INDEX8	12
-
-
-#define BGR8(r,g,b) (unsigned)(((BYTE)((b & 0xc0) | ((g & 0xe0)>>2) | \
-                                      ((r & 0xe0)>>5))))
-
-/* Windows uses 5,5,5 for 16-bit */
-#define BGR16(r,g,b) (  (((unsigned short)b       ) >> 3) | \
-                        (((unsigned short)g & 0xf8) << 2) | \
-                        (((unsigned short)r & 0xf8) << 7) )
-
-#define BGR24(r,g,b) (unsigned long)((DWORD)(((BYTE)(b)| \
-                                    ((WORD)((BYTE)(g))<<8))| \
-                                    (((DWORD)(BYTE)(r))<<16)))
-
-#define BGR32(r,g,b) (unsigned long)((DWORD)(((BYTE)(b)| \
-                                    ((WORD)((BYTE)(g))<<8))| \
-                                    (((DWORD)(BYTE)(r))<<16)))
-
-
diff --git a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
new file mode 100644
index 00000000000..e66ce48f2dc
--- /dev/null
+++ b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
@@ -0,0 +1,344 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Softpipe support.
+ *
+ * @author Keith Whitwell
+ * @author Brian Paul
+ * @author Jose Fonseca
+ */
+
+
+#include <windows.h>
+
+#include "pipe/p_winsys.h"
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "softpipe/sp_winsys.h"
+#include "stw_winsys.h"
+
+
+struct gdi_softpipe_buffer
+{
+   struct pipe_buffer base;
+   boolean userBuffer;  /** Is this a user-space buffer? */
+   void *data;
+   void *mapped;
+};
+
+
+/** Cast wrapper */
+static INLINE struct gdi_softpipe_buffer *
+gdi_softpipe_buffer( struct pipe_buffer *buf )
+{
+   return (struct gdi_softpipe_buffer *)buf;
+}
+
+
+static void *
+gdi_softpipe_buffer_map(struct pipe_winsys *winsys,
+                        struct pipe_buffer *buf,
+                        unsigned flags)
+{
+   struct gdi_softpipe_buffer *gdi_softpipe_buf = gdi_softpipe_buffer(buf);
+   gdi_softpipe_buf->mapped = gdi_softpipe_buf->data;
+   return gdi_softpipe_buf->mapped;
+}
+
+
+static void
+gdi_softpipe_buffer_unmap(struct pipe_winsys *winsys,
+                          struct pipe_buffer *buf)
+{
+   struct gdi_softpipe_buffer *gdi_softpipe_buf = gdi_softpipe_buffer(buf);
+   gdi_softpipe_buf->mapped = NULL;
+}
+
+
+static void
+gdi_softpipe_buffer_destroy(struct pipe_winsys *winsys,
+                            struct pipe_buffer *buf)
+{
+   struct gdi_softpipe_buffer *oldBuf = gdi_softpipe_buffer(buf);
+
+   if (oldBuf->data) {
+      if (!oldBuf->userBuffer)
+         align_free(oldBuf->data);
+
+      oldBuf->data = NULL;
+   }
+
+   FREE(oldBuf);
+}
+
+
+static const char *
+gdi_softpipe_get_name(struct pipe_winsys *winsys)
+{
+   return "softpipe";
+}
+
+
+static struct pipe_buffer *
+gdi_softpipe_buffer_create(struct pipe_winsys *winsys,
+                           unsigned alignment,
+                           unsigned usage,
+                           unsigned size)
+{
+   struct gdi_softpipe_buffer *buffer = CALLOC_STRUCT(gdi_softpipe_buffer);
+
+   buffer->base.refcount = 1;
+   buffer->base.alignment = alignment;
+   buffer->base.usage = usage;
+   buffer->base.size = size;
+
+   buffer->data = align_malloc(size, alignment);
+
+   return &buffer->base;
+}
+
+
+/**
+ * Create buffer which wraps user-space data.
+ */
+static struct pipe_buffer *
+gdi_softpipe_user_buffer_create(struct pipe_winsys *winsys,
+                               void *ptr,
+                               unsigned bytes)
+{
+   struct gdi_softpipe_buffer *buffer;
+
+   buffer = CALLOC_STRUCT(gdi_softpipe_buffer);
+   if(!buffer)
+      return NULL;
+
+   buffer->base.refcount = 1;
+   buffer->base.size = bytes;
+   buffer->userBuffer = TRUE;
+   buffer->data = ptr;
+
+   return &buffer->base;
+}
+
+
+/**
+ * Round n up to next multiple.
+ */
+static INLINE unsigned
+round_up(unsigned n, unsigned multiple)
+{
+   return (n + multiple - 1) & ~(multiple - 1);
+}
+
+
+static int
+gdi_softpipe_surface_alloc_storage(struct pipe_winsys *winsys,
+                                   struct pipe_surface *surf,
+                                   unsigned width, unsigned height,
+                                   enum pipe_format format,
+                                   unsigned flags,
+                                   unsigned tex_usage)
+{
+   const unsigned alignment = 64;
+
+   surf->width = width;
+   surf->height = height;
+   surf->format = format;
+   pf_get_block(format, &surf->block);
+   surf->nblocksx = pf_get_nblocksx(&surf->block, width);
+   surf->nblocksy = pf_get_nblocksy(&surf->block, height);
+   surf->stride = round_up(surf->nblocksx * surf->block.size, alignment);
+   surf->usage = flags;
+
+   assert(!surf->buffer);
+   surf->buffer = winsys->buffer_create(winsys, alignment,
+                                        PIPE_BUFFER_USAGE_PIXEL,
+                                        surf->stride * surf->nblocksy);
+   if(!surf->buffer)
+      return -1;
+
+   return 0;
+}
+
+
+static struct pipe_surface *
+gdi_softpipe_surface_alloc(struct pipe_winsys *winsys)
+{
+   struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
+
+   assert(winsys);
+
+   surface->refcount = 1;
+   surface->winsys = winsys;
+
+   return surface;
+}
+
+
+static void
+gdi_softpipe_surface_release(struct pipe_winsys *winsys,
+                             struct pipe_surface **s)
+{
+   struct pipe_surface *surf = *s;
+   assert(!surf->texture);
+   surf->refcount--;
+   if (surf->refcount == 0) {
+      if (surf->buffer)
+	winsys_buffer_reference(winsys, &surf->buffer, NULL);
+      free(surf);
+   }
+   *s = NULL;
+}
+
+
+static void
+gdi_softpipe_dummy_flush_frontbuffer(struct pipe_winsys *winsys,
+                                     struct pipe_surface *surface,
+                                     void *context_private)
+{
+   assert(0);
+}
+
+
+static void
+gdi_softpipe_fence_reference(struct pipe_winsys *winsys,
+                             struct pipe_fence_handle **ptr,
+                             struct pipe_fence_handle *fence)
+{
+}
+
+
+static int
+gdi_softpipe_fence_signalled(struct pipe_winsys *winsys,
+                             struct pipe_fence_handle *fence,
+                             unsigned flag)
+{
+   return 0;
+}
+
+
+static int
+gdi_softpipe_fence_finish(struct pipe_winsys *winsys,
+                          struct pipe_fence_handle *fence,
+                          unsigned flag)
+{
+   return 0;
+}
+
+
+static void
+gdi_softpipe_destroy(struct pipe_winsys *winsys)
+{
+   FREE(winsys);
+}
+
+
+static struct pipe_screen *
+gdi_softpipe_screen_create(void)
+{
+   static struct pipe_winsys *winsys;
+   struct pipe_screen *screen;
+
+   winsys = CALLOC_STRUCT(pipe_winsys);
+   if(!winsys)
+      return NULL;
+
+   winsys->destroy = gdi_softpipe_destroy;
+
+   winsys->buffer_create = gdi_softpipe_buffer_create;
+   winsys->user_buffer_create = gdi_softpipe_user_buffer_create;
+   winsys->buffer_map = gdi_softpipe_buffer_map;
+   winsys->buffer_unmap = gdi_softpipe_buffer_unmap;
+   winsys->buffer_destroy = gdi_softpipe_buffer_destroy;
+
+   winsys->surface_alloc = gdi_softpipe_surface_alloc;
+   winsys->surface_alloc_storage = gdi_softpipe_surface_alloc_storage;
+   winsys->surface_release = gdi_softpipe_surface_release;
+
+   winsys->fence_reference = gdi_softpipe_fence_reference;
+   winsys->fence_signalled = gdi_softpipe_fence_signalled;
+   winsys->fence_finish = gdi_softpipe_fence_finish;
+
+   winsys->flush_frontbuffer = gdi_softpipe_dummy_flush_frontbuffer;
+   winsys->get_name = gdi_softpipe_get_name;
+
+   screen = softpipe_create_screen(winsys);
+   if(!screen)
+      gdi_softpipe_destroy(winsys);
+
+   return screen;
+}
+
+
+static struct pipe_context *
+gdi_softpipe_context_create(struct pipe_screen *screen)
+{
+   return softpipe_create(screen, screen->winsys, NULL);
+}
+
+
+static void
+gdi_softpipe_flush_frontbuffer(struct pipe_winsys *winsys,
+                               struct pipe_surface *surface,
+                               HDC hDC)
+{
+    struct gdi_softpipe_buffer *buffer;
+    BITMAPINFO bmi;
+
+    buffer = gdi_softpipe_buffer(surface->buffer);
+
+    memset(&bmi, 0, sizeof(BITMAPINFO));
+    bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
+    bmi.bmiHeader.biWidth = surface->stride / pf_get_size(surface->format);
+    bmi.bmiHeader.biHeight= -surface->height;
+    bmi.bmiHeader.biPlanes = 1;
+    bmi.bmiHeader.biBitCount = pf_get_bits(surface->format);
+    bmi.bmiHeader.biCompression = BI_RGB;
+    bmi.bmiHeader.biSizeImage = 0;
+    bmi.bmiHeader.biXPelsPerMeter = 0;
+    bmi.bmiHeader.biYPelsPerMeter = 0;
+    bmi.bmiHeader.biClrUsed = 0;
+    bmi.bmiHeader.biClrImportant = 0;
+
+    StretchDIBits(hDC,
+                  0, 0, surface->width, surface->height,
+                  0, 0, surface->width, surface->height,
+                  buffer->data, &bmi, 0, SRCCOPY);
+}
+
+
+const struct stw_winsys stw_winsys = {
+   &gdi_softpipe_screen_create,
+   &gdi_softpipe_context_create,
+   &gdi_softpipe_flush_frontbuffer
+};
diff --git a/src/gallium/winsys/gdi/opengl32.def b/src/gallium/winsys/gdi/opengl32.def
deleted file mode 100644
index 54e72f57b19..00000000000
--- a/src/gallium/winsys/gdi/opengl32.def
+++ /dev/null
@@ -1,859 +0,0 @@
-; DO NOT EDIT - This file generated automatically by mesadef.py script
-;DESCRIPTION 'Mesa (OpenGL work-alike) for Win32'
-VERSION 6.5
-;
-; Module definition file for Mesa (OPENGL32.DLL)
-;
-; Note: The OpenGL functions use the STDCALL
-; function calling convention.  Microsoft's
-; OPENGL32 uses this convention and so must the
-; Mesa OPENGL32 so that the Mesa DLL can be used
-; as a drop-in replacement.
-;
-; The linker exports STDCALL entry points with
-; 'decorated' names; e.g., _glBegin@0, where the
-; trailing number is the number of bytes of 
-; parameter data pushed onto the stack.  The
-; callee is responsible for popping this data
-; off the stack, usually via a RETF n instruction.
-;
-; However, the Microsoft OPENGL32.DLL does not export
-; the decorated names, even though the calling convention
-; is STDCALL.  So, this module definition file is
-; needed to force the Mesa OPENGL32.DLL to export the
-; symbols in the same manner as the Microsoft DLL.
-; Were it not for this problem, this file would not
-; be needed (for the gl* functions) since the entry
-; points are compiled with dllexport declspec.
-;
-; However, this file is still needed to export "internal"
-; Mesa symbols for the benefit of the OSMESA32.DLL.
-;
-EXPORTS
-	glNewList
-	glEndList
-	glCallList
-	glCallLists
-	glDeleteLists
-	glGenLists
-	glListBase
-	glBegin
-	glBitmap
-	glColor3b
-	glColor3bv
-	glColor3d
-	glColor3dv
-	glColor3f
-	glColor3fv
-	glColor3i
-	glColor3iv
-	glColor3s
-	glColor3sv
-	glColor3ub
-	glColor3ubv
-	glColor3ui
-	glColor3uiv
-	glColor3us
-	glColor3usv
-	glColor4b
-	glColor4bv
-	glColor4d
-	glColor4dv
-	glColor4f
-	glColor4fv
-	glColor4i
-	glColor4iv
-	glColor4s
-	glColor4sv
-	glColor4ub
-	glColor4ubv
-	glColor4ui
-	glColor4uiv
-	glColor4us
-	glColor4usv
-	glEdgeFlag
-	glEdgeFlagv
-	glEnd
-	glIndexd
-	glIndexdv
-	glIndexf
-	glIndexfv
-	glIndexi
-	glIndexiv
-	glIndexs
-	glIndexsv
-	glNormal3b
-	glNormal3bv
-	glNormal3d
-	glNormal3dv
-	glNormal3f
-	glNormal3fv
-	glNormal3i
-	glNormal3iv
-	glNormal3s
-	glNormal3sv
-	glRasterPos2d
-	glRasterPos2dv
-	glRasterPos2f
-	glRasterPos2fv
-	glRasterPos2i
-	glRasterPos2iv
-	glRasterPos2s
-	glRasterPos2sv
-	glRasterPos3d
-	glRasterPos3dv
-	glRasterPos3f
-	glRasterPos3fv
-	glRasterPos3i
-	glRasterPos3iv
-	glRasterPos3s
-	glRasterPos3sv
-	glRasterPos4d
-	glRasterPos4dv
-	glRasterPos4f
-	glRasterPos4fv
-	glRasterPos4i
-	glRasterPos4iv
-	glRasterPos4s
-	glRasterPos4sv
-	glRectd
-	glRectdv
-	glRectf
-	glRectfv
-	glRecti
-	glRectiv
-	glRects
-	glRectsv
-	glTexCoord1d
-	glTexCoord1dv
-	glTexCoord1f
-	glTexCoord1fv
-	glTexCoord1i
-	glTexCoord1iv
-	glTexCoord1s
-	glTexCoord1sv
-	glTexCoord2d
-	glTexCoord2dv
-	glTexCoord2f
-	glTexCoord2fv
-	glTexCoord2i
-	glTexCoord2iv
-	glTexCoord2s
-	glTexCoord2sv
-	glTexCoord3d
-	glTexCoord3dv
-	glTexCoord3f
-	glTexCoord3fv
-	glTexCoord3i
-	glTexCoord3iv
-	glTexCoord3s
-	glTexCoord3sv
-	glTexCoord4d
-	glTexCoord4dv
-	glTexCoord4f
-	glTexCoord4fv
-	glTexCoord4i
-	glTexCoord4iv
-	glTexCoord4s
-	glTexCoord4sv
-	glVertex2d
-	glVertex2dv
-	glVertex2f
-	glVertex2fv
-	glVertex2i
-	glVertex2iv
-	glVertex2s
-	glVertex2sv
-	glVertex3d
-	glVertex3dv
-	glVertex3f
-	glVertex3fv
-	glVertex3i
-	glVertex3iv
-	glVertex3s
-	glVertex3sv
-	glVertex4d
-	glVertex4dv
-	glVertex4f
-	glVertex4fv
-	glVertex4i
-	glVertex4iv
-	glVertex4s
-	glVertex4sv
-	glClipPlane
-	glColorMaterial
-	glCullFace
-	glFogf
-	glFogfv
-	glFogi
-	glFogiv
-	glFrontFace
-	glHint
-	glLightf
-	glLightfv
-	glLighti
-	glLightiv
-	glLightModelf
-	glLightModelfv
-	glLightModeli
-	glLightModeliv
-	glLineStipple
-	glLineWidth
-	glMaterialf
-	glMaterialfv
-	glMateriali
-	glMaterialiv
-	glPointSize
-	glPolygonMode
-	glPolygonStipple
-	glScissor
-	glShadeModel
-	glTexParameterf
-	glTexParameterfv
-	glTexParameteri
-	glTexParameteriv
-	glTexImage1D
-	glTexImage2D
-	glTexEnvf
-	glTexEnvfv
-	glTexEnvi
-	glTexEnviv
-	glTexGend
-	glTexGendv
-	glTexGenf
-	glTexGenfv
-	glTexGeni
-	glTexGeniv
-	glFeedbackBuffer
-	glSelectBuffer
-	glRenderMode
-	glInitNames
-	glLoadName
-	glPassThrough
-	glPopName
-	glPushName
-	glDrawBuffer
-	glClear
-	glClearAccum
-	glClearIndex
-	glClearColor
-	glClearStencil
-	glClearDepth
-	glStencilMask
-	glColorMask
-	glDepthMask
-	glIndexMask
-	glAccum
-	glDisable
-	glEnable
-	glFinish
-	glFlush
-	glPopAttrib
-	glPushAttrib
-	glMap1d
-	glMap1f
-	glMap2d
-	glMap2f
-	glMapGrid1d
-	glMapGrid1f
-	glMapGrid2d
-	glMapGrid2f
-	glEvalCoord1d
-	glEvalCoord1dv
-	glEvalCoord1f
-	glEvalCoord1fv
-	glEvalCoord2d
-	glEvalCoord2dv
-	glEvalCoord2f
-	glEvalCoord2fv
-	glEvalMesh1
-	glEvalPoint1
-	glEvalMesh2
-	glEvalPoint2
-	glAlphaFunc
-	glBlendFunc
-	glLogicOp
-	glStencilFunc
-	glStencilOp
-	glDepthFunc
-	glPixelZoom
-	glPixelTransferf
-	glPixelTransferi
-	glPixelStoref
-	glPixelStorei
-	glPixelMapfv
-	glPixelMapuiv
-	glPixelMapusv
-	glReadBuffer
-	glCopyPixels
-	glReadPixels
-	glDrawPixels
-	glGetBooleanv
-	glGetClipPlane
-	glGetDoublev
-	glGetError
-	glGetFloatv
-	glGetIntegerv
-	glGetLightfv
-	glGetLightiv
-	glGetMapdv
-	glGetMapfv
-	glGetMapiv
-	glGetMaterialfv
-	glGetMaterialiv
-	glGetPixelMapfv
-	glGetPixelMapuiv
-	glGetPixelMapusv
-	glGetPolygonStipple
-	glGetString
-	glGetTexEnvfv
-	glGetTexEnviv
-	glGetTexGendv
-	glGetTexGenfv
-	glGetTexGeniv
-	glGetTexImage
-	glGetTexParameterfv
-	glGetTexParameteriv
-	glGetTexLevelParameterfv
-	glGetTexLevelParameteriv
-	glIsEnabled
-	glIsList
-	glDepthRange
-	glFrustum
-	glLoadIdentity
-	glLoadMatrixf
-	glLoadMatrixd
-	glMatrixMode
-	glMultMatrixf
-	glMultMatrixd
-	glOrtho
-	glPopMatrix
-	glPushMatrix
-	glRotated
-	glRotatef
-	glScaled
-	glScalef
-	glTranslated
-	glTranslatef
-	glViewport
-	glArrayElement
-	glColorPointer
-	glDisableClientState
-	glDrawArrays
-	glDrawElements
-	glEdgeFlagPointer
-	glEnableClientState
-	glGetPointerv
-	glIndexPointer
-	glInterleavedArrays
-	glNormalPointer
-	glTexCoordPointer
-	glVertexPointer
-	glPolygonOffset
-	glCopyTexImage1D
-	glCopyTexImage2D
-	glCopyTexSubImage1D
-	glCopyTexSubImage2D
-	glTexSubImage1D
-	glTexSubImage2D
-	glAreTexturesResident
-	glBindTexture
-	glDeleteTextures
-	glGenTextures
-	glIsTexture
-	glPrioritizeTextures
-	glIndexub
-	glIndexubv
-	glPopClientAttrib
-	glPushClientAttrib
-	glBlendColor
-	glBlendEquation
-	glDrawRangeElements
-	glColorTable
-	glColorTableParameterfv
-	glColorTableParameteriv
-	glCopyColorTable
-	glGetColorTable
-	glGetColorTableParameterfv
-	glGetColorTableParameteriv
-	glColorSubTable
-	glCopyColorSubTable
-	glConvolutionFilter1D
-	glConvolutionFilter2D
-	glConvolutionParameterf
-	glConvolutionParameterfv
-	glConvolutionParameteri
-	glConvolutionParameteriv
-	glCopyConvolutionFilter1D
-	glCopyConvolutionFilter2D
-	glGetConvolutionFilter
-	glGetConvolutionParameterfv
-	glGetConvolutionParameteriv
-	glGetSeparableFilter
-	glSeparableFilter2D
-	glGetHistogram
-	glGetHistogramParameterfv
-	glGetHistogramParameteriv
-	glGetMinmax
-	glGetMinmaxParameterfv
-	glGetMinmaxParameteriv
-	glHistogram
-	glMinmax
-	glResetHistogram
-	glResetMinmax
-	glTexImage3D
-	glTexSubImage3D
-	glCopyTexSubImage3D
-	glActiveTextureARB
-	glClientActiveTextureARB
-	glMultiTexCoord1dARB
-	glMultiTexCoord1dvARB
-	glMultiTexCoord1fARB
-	glMultiTexCoord1fvARB
-	glMultiTexCoord1iARB
-	glMultiTexCoord1ivARB
-	glMultiTexCoord1sARB
-	glMultiTexCoord1svARB
-	glMultiTexCoord2dARB
-	glMultiTexCoord2dvARB
-	glMultiTexCoord2fARB
-	glMultiTexCoord2fvARB
-	glMultiTexCoord2iARB
-	glMultiTexCoord2ivARB
-	glMultiTexCoord2sARB
-	glMultiTexCoord2svARB
-	glMultiTexCoord3dARB
-	glMultiTexCoord3dvARB
-	glMultiTexCoord3fARB
-	glMultiTexCoord3fvARB
-	glMultiTexCoord3iARB
-	glMultiTexCoord3ivARB
-	glMultiTexCoord3sARB
-	glMultiTexCoord3svARB
-	glMultiTexCoord4dARB
-	glMultiTexCoord4dvARB
-	glMultiTexCoord4fARB
-	glMultiTexCoord4fvARB
-	glMultiTexCoord4iARB
-	glMultiTexCoord4ivARB
-	glMultiTexCoord4sARB
-	glMultiTexCoord4svARB
-	glLoadTransposeMatrixfARB
-	glLoadTransposeMatrixdARB
-	glMultTransposeMatrixfARB
-	glMultTransposeMatrixdARB
-	glSampleCoverageARB
-	glCompressedTexImage3DARB
-	glCompressedTexImage2DARB
-	glCompressedTexImage1DARB
-	glCompressedTexSubImage3DARB
-	glCompressedTexSubImage2DARB
-	glCompressedTexSubImage1DARB
-	glGetCompressedTexImageARB
-	glActiveTexture
-	glClientActiveTexture
-	glMultiTexCoord1d
-	glMultiTexCoord1dv
-	glMultiTexCoord1f
-	glMultiTexCoord1fv
-	glMultiTexCoord1i
-	glMultiTexCoord1iv
-	glMultiTexCoord1s
-	glMultiTexCoord1sv
-	glMultiTexCoord2d
-	glMultiTexCoord2dv
-	glMultiTexCoord2f
-	glMultiTexCoord2fv
-	glMultiTexCoord2i
-	glMultiTexCoord2iv
-	glMultiTexCoord2s
-	glMultiTexCoord2sv
-	glMultiTexCoord3d
-	glMultiTexCoord3dv
-	glMultiTexCoord3f
-	glMultiTexCoord3fv
-	glMultiTexCoord3i
-	glMultiTexCoord3iv
-	glMultiTexCoord3s
-	glMultiTexCoord3sv
-	glMultiTexCoord4d
-	glMultiTexCoord4dv
-	glMultiTexCoord4f
-	glMultiTexCoord4fv
-	glMultiTexCoord4i
-	glMultiTexCoord4iv
-	glMultiTexCoord4s
-	glMultiTexCoord4sv
-	glLoadTransposeMatrixf
-	glLoadTransposeMatrixd
-	glMultTransposeMatrixf
-	glMultTransposeMatrixd
-	glSampleCoverage
-	glCompressedTexImage3D
-	glCompressedTexImage2D
-	glCompressedTexImage1D
-	glCompressedTexSubImage3D
-	glCompressedTexSubImage2D
-	glCompressedTexSubImage1D
-	glGetCompressedTexImage
-	glBlendColorEXT
-	glPolygonOffsetEXT
-	glTexImage3DEXT
-	glTexSubImage3DEXT
-	glTexSubImage1DEXT
-	glTexSubImage2DEXT
-	glCopyTexImage1DEXT
-	glCopyTexImage2DEXT
-	glCopyTexSubImage1DEXT
-	glCopyTexSubImage2DEXT
-	glCopyTexSubImage3DEXT
-	glAreTexturesResidentEXT
-	glBindTextureEXT
-	glDeleteTexturesEXT
-	glGenTexturesEXT
-	glIsTextureEXT
-	glPrioritizeTexturesEXT
-	glArrayElementEXT
-	glColorPointerEXT
-	glDrawArraysEXT
-	glEdgeFlagPointerEXT
-	glGetPointervEXT
-	glIndexPointerEXT
-	glNormalPointerEXT
-	glTexCoordPointerEXT
-	glVertexPointerEXT
-	glBlendEquationEXT
-	glPointParameterfEXT
-	glPointParameterfvEXT
-	glPointParameterfARB
-	glPointParameterfvARB
-	glColorTableEXT
-	glGetColorTableEXT
-	glGetColorTableParameterivEXT
-	glGetColorTableParameterfvEXT
-	glLockArraysEXT
-	glUnlockArraysEXT
-	glDrawRangeElementsEXT
-	glSecondaryColor3bEXT
-	glSecondaryColor3bvEXT
-	glSecondaryColor3dEXT
-	glSecondaryColor3dvEXT
-	glSecondaryColor3fEXT
-	glSecondaryColor3fvEXT
-	glSecondaryColor3iEXT
-	glSecondaryColor3ivEXT
-	glSecondaryColor3sEXT
-	glSecondaryColor3svEXT
-	glSecondaryColor3ubEXT
-	glSecondaryColor3ubvEXT
-	glSecondaryColor3uiEXT
-	glSecondaryColor3uivEXT
-	glSecondaryColor3usEXT
-	glSecondaryColor3usvEXT
-	glSecondaryColorPointerEXT
-	glMultiDrawArraysEXT
-	glMultiDrawElementsEXT
-	glFogCoordfEXT
-	glFogCoordfvEXT
-	glFogCoorddEXT
-	glFogCoorddvEXT
-	glFogCoordPointerEXT
-	glBlendFuncSeparateEXT
-	glFlushVertexArrayRangeNV
-	glVertexArrayRangeNV
-	glCombinerParameterfvNV
-	glCombinerParameterfNV
-	glCombinerParameterivNV
-	glCombinerParameteriNV
-	glCombinerInputNV
-	glCombinerOutputNV
-	glFinalCombinerInputNV
-	glGetCombinerInputParameterfvNV
-	glGetCombinerInputParameterivNV
-	glGetCombinerOutputParameterfvNV
-	glGetCombinerOutputParameterivNV
-	glGetFinalCombinerInputParameterfvNV
-	glGetFinalCombinerInputParameterivNV
-	glResizeBuffersMESA
-	glWindowPos2dMESA
-	glWindowPos2dvMESA
-	glWindowPos2fMESA
-	glWindowPos2fvMESA
-	glWindowPos2iMESA
-	glWindowPos2ivMESA
-	glWindowPos2sMESA
-	glWindowPos2svMESA
-	glWindowPos3dMESA
-	glWindowPos3dvMESA
-	glWindowPos3fMESA
-	glWindowPos3fvMESA
-	glWindowPos3iMESA
-	glWindowPos3ivMESA
-	glWindowPos3sMESA
-	glWindowPos3svMESA
-	glWindowPos4dMESA
-	glWindowPos4dvMESA
-	glWindowPos4fMESA
-	glWindowPos4fvMESA
-	glWindowPos4iMESA
-	glWindowPos4ivMESA
-	glWindowPos4sMESA
-	glWindowPos4svMESA
-	glWindowPos2dARB
-	glWindowPos2fARB
-	glWindowPos2iARB
-	glWindowPos2sARB
-	glWindowPos2dvARB
-	glWindowPos2fvARB
-	glWindowPos2ivARB
-	glWindowPos2svARB
-	glWindowPos3dARB
-	glWindowPos3fARB
-	glWindowPos3iARB
-	glWindowPos3sARB
-	glWindowPos3dvARB
-	glWindowPos3fvARB
-	glWindowPos3ivARB
-	glWindowPos3svARB
-	glAreProgramsResidentNV
-	glBindProgramNV
-	glDeleteProgramsNV
-	glExecuteProgramNV
-	glGenProgramsNV
-	glGetProgramParameterdvNV
-	glGetProgramParameterfvNV
-	glGetProgramivNV
-	glGetProgramStringNV
-	glGetTrackMatrixivNV
-	glGetVertexAttribdvNV
-	glGetVertexAttribfvNV
-	glGetVertexAttribivNV
-	glGetVertexAttribPointervNV
-	glIsProgramNV
-	glLoadProgramNV
-	glProgramParameter4dNV
-	glProgramParameter4dvNV
-	glProgramParameter4fNV
-	glProgramParameter4fvNV
-	glProgramParameters4dvNV
-	glProgramParameters4fvNV
-	glRequestResidentProgramsNV
-	glTrackMatrixNV
-	glVertexAttribPointerNV
-	glVertexAttrib1dNV
-	glVertexAttrib1dvNV
-	glVertexAttrib1fNV
-	glVertexAttrib1fvNV
-	glVertexAttrib1sNV
-	glVertexAttrib1svNV
-	glVertexAttrib2dNV
-	glVertexAttrib2dvNV
-	glVertexAttrib2fNV
-	glVertexAttrib2fvNV
-	glVertexAttrib2sNV
-	glVertexAttrib2svNV
-	glVertexAttrib3dNV
-	glVertexAttrib3dvNV
-	glVertexAttrib3fNV
-	glVertexAttrib3fvNV
-	glVertexAttrib3sNV
-	glVertexAttrib3svNV
-	glVertexAttrib4dNV
-	glVertexAttrib4dvNV
-	glVertexAttrib4fNV
-	glVertexAttrib4fvNV
-	glVertexAttrib4sNV
-	glVertexAttrib4svNV
-	glVertexAttrib4ubNV
-	glVertexAttrib4ubvNV
-	glVertexAttribs1dvNV
-	glVertexAttribs1fvNV
-	glVertexAttribs1svNV
-	glVertexAttribs2dvNV
-	glVertexAttribs2fvNV
-	glVertexAttribs2svNV
-	glVertexAttribs3dvNV
-	glVertexAttribs3fvNV
-	glVertexAttribs3svNV
-	glVertexAttribs4dvNV
-	glVertexAttribs4fvNV
-	glVertexAttribs4svNV
-	glVertexAttribs4ubvNV
-	glPointParameteriNV
-	glPointParameterivNV
-	glFogCoordf
-	glFogCoordfv
-	glFogCoordd
-	glFogCoorddv
-	glFogCoordPointer
-	glMultiDrawArrays
-	glMultiDrawElements
-	glPointParameterf
-	glPointParameterfv
-	glPointParameteri
-	glPointParameteriv
-	glSecondaryColor3b
-	glSecondaryColor3bv
-	glSecondaryColor3d
-	glSecondaryColor3dv
-	glSecondaryColor3f
-	glSecondaryColor3fv
-	glSecondaryColor3i
-	glSecondaryColor3iv
-	glSecondaryColor3s
-	glSecondaryColor3sv
-	glSecondaryColor3ub
-	glSecondaryColor3ubv
-	glSecondaryColor3ui
-	glSecondaryColor3uiv
-	glSecondaryColor3us
-	glSecondaryColor3usv
-	glSecondaryColorPointer
-	glWindowPos2d
-	glWindowPos2dv
-	glWindowPos2f
-	glWindowPos2fv
-	glWindowPos2i
-	glWindowPos2iv
-	glWindowPos2s
-	glWindowPos2sv
-	glWindowPos3d
-	glWindowPos3dv
-	glWindowPos3f
-	glWindowPos3fv
-	glWindowPos3i
-	glWindowPos3iv
-	glWindowPos3s
-	glWindowPos3sv
-	glVertexAttrib1sARB
-	glVertexAttrib1fARB
-	glVertexAttrib1dARB
-	glVertexAttrib2sARB
-	glVertexAttrib2fARB
-	glVertexAttrib2dARB
-	glVertexAttrib3sARB
-	glVertexAttrib3fARB
-	glVertexAttrib3dARB
-	glVertexAttrib4sARB
-	glVertexAttrib4fARB
-	glVertexAttrib4dARB
-	glVertexAttrib4NubARB
-	glVertexAttrib1svARB
-	glVertexAttrib1fvARB
-	glVertexAttrib1dvARB
-	glVertexAttrib2svARB
-	glVertexAttrib2fvARB
-	glVertexAttrib2dvARB
-	glVertexAttrib3svARB
-	glVertexAttrib3fvARB
-	glVertexAttrib3dvARB
-	glVertexAttrib4bvARB
-	glVertexAttrib4svARB
-	glVertexAttrib4ivARB
-	glVertexAttrib4ubvARB
-	glVertexAttrib4usvARB
-	glVertexAttrib4uivARB
-	glVertexAttrib4fvARB
-	glVertexAttrib4dvARB
-	glVertexAttrib4NbvARB
-	glVertexAttrib4NsvARB
-	glVertexAttrib4NivARB
-	glVertexAttrib4NubvARB
-	glVertexAttrib4NusvARB
-	glVertexAttrib4NuivARB
-	glVertexAttribPointerARB
-	glEnableVertexAttribArrayARB
-	glDisableVertexAttribArrayARB
-	glProgramStringARB
-	glBindProgramARB
-	glDeleteProgramsARB
-	glGenProgramsARB
-	glIsProgramARB
-	glProgramEnvParameter4dARB
-	glProgramEnvParameter4dvARB
-	glProgramEnvParameter4fARB
-	glProgramEnvParameter4fvARB
-	glProgramLocalParameter4dARB
-	glProgramLocalParameter4dvARB
-	glProgramLocalParameter4fARB
-	glProgramLocalParameter4fvARB
-	glGetProgramEnvParameterdvARB
-	glGetProgramEnvParameterfvARB
-	glGetProgramLocalParameterdvARB
-	glGetProgramLocalParameterfvARB
-	glGetProgramivARB
-	glGetProgramStringARB
-	glGetVertexAttribdvARB
-	glGetVertexAttribfvARB
-	glGetVertexAttribivARB
-	glGetVertexAttribPointervARB
-	glProgramNamedParameter4fNV
-	glProgramNamedParameter4dNV
-	glProgramNamedParameter4fvNV
-	glProgramNamedParameter4dvNV
-	glGetProgramNamedParameterfvNV
-	glGetProgramNamedParameterdvNV
-	glBindBufferARB
-	glBufferDataARB
-	glBufferSubDataARB
-	glDeleteBuffersARB
-	glGenBuffersARB
-	glGetBufferParameterivARB
-	glGetBufferPointervARB
-	glGetBufferSubDataARB
-	glIsBufferARB
-	glMapBufferARB
-	glUnmapBufferARB
-	glGenQueriesARB
-	glDeleteQueriesARB
-	glIsQueryARB
-	glBeginQueryARB
-	glEndQueryARB
-	glGetQueryivARB
-	glGetQueryObjectivARB
-	glGetQueryObjectuivARB
-	glBindBuffer
-	glBufferData
-	glBufferSubData
-	glDeleteBuffers
-	glGenBuffers
-	glGetBufferParameteriv
-	glGetBufferPointerv
-	glGetBufferSubData
-	glIsBuffer
-	glMapBuffer
-	glUnmapBuffer
-	glGenQueries
-	glDeleteQueries
-	glIsQuery
-	glBeginQuery
-	glEndQuery
-	glGetQueryiv
-	glGetQueryObjectiv
-	glGetQueryObjectuiv
-;
-; WGL API
-	wglChoosePixelFormat
-	wglCopyContext
-	wglCreateContext
-	wglCreateLayerContext
-	wglDeleteContext
-	wglDescribeLayerPlane
-	wglDescribePixelFormat
-	wglGetCurrentContext
-	wglGetCurrentDC
-	wglGetLayerPaletteEntries
-	wglGetPixelFormat
-	wglGetProcAddress
-	wglMakeCurrent
-	wglRealizeLayerPalette
-	wglSetLayerPaletteEntries
-	wglSetPixelFormat
-	wglShareLists
-	wglSwapBuffers
-	wglSwapLayerBuffers
-	wglUseFontBitmapsA
-	wglUseFontBitmapsW
-	wglUseFontOutlinesA
-	wglUseFontOutlinesW
-	wglGetExtensionsStringARB
diff --git a/src/gallium/winsys/gdi/wgl.c b/src/gallium/winsys/gdi/wgl.c
deleted file mode 100644
index 3ce470480d4..00000000000
--- a/src/gallium/winsys/gdi/wgl.c
+++ /dev/null
@@ -1,701 +0,0 @@
-/*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-/*
- * File name 	: wgl.c
- * WGL stuff. Added by Oleg Letsinsky, [email protected]
- * Some things originated from the 3Dfx WGL functions
- */
-
-/* 
- * This file contains the implementation of the wgl* functions for
- * Mesa on Windows.  Since these functions are provided by Windows in
- * GDI/OpenGL, we must supply our versions that work with Mesa here.
- */
-
-
-/* We're essentially building part of GDI here, so define this so that
- * we get the right export linkage. */
-#ifdef __MINGW32__
-
-#include <stdarg.h>
-#include <windef.h>
-#include <wincon.h>
-#include <winbase.h>
-
-#  if defined(BUILD_GL32)
-#    define WINGDIAPI __declspec(dllexport)	
-#  else
-#    define __W32API_USE_DLLIMPORT__
-#  endif
-
-#include <wingdi.h>
-#include "GL/mesa_wgl.h"
-#include <stdlib.h>
-
-#else
-
-#define _GDI32_
-#include <windows.h>
-
-#endif
-
-#include "glapi.h"
-#include "GL/wmesa.h"   /* protos for wmesa* functions */
-
-/*
- * Pixel Format Descriptors
- */
-
-/* Extend the PFD to include DB flag */
-struct __pixelformat__
-{
-    PIXELFORMATDESCRIPTOR pfd;
-    GLboolean doubleBuffered;
-};
-
-/* These are the PFD's supported by this driver. */
-struct __pixelformat__	pfd[] =
-{
-    /* Double Buffer, alpha */
-    {	
-	{	
-	    sizeof(PIXELFORMATDESCRIPTOR),	1,
-	    PFD_DRAW_TO_WINDOW|PFD_SUPPORT_OPENGL|
-	    PFD_GENERIC_FORMAT|PFD_DOUBLEBUFFER|PFD_SWAP_COPY,
-	    PFD_TYPE_RGBA,
-	    24,	
-	    8, 0,	
-	    8, 8,	
-	    8, 16,	
-	    8, 24,
-	    0, 0, 0, 0, 0,	
-	    16,	8,	
-	    0, 0, 0,	
-	    0, 0, 0 
-	},
-        GL_TRUE
-    },
-    /* Single Buffer, alpha */
-    {	
-	{	
-	    sizeof(PIXELFORMATDESCRIPTOR),	1,
-	    PFD_DRAW_TO_WINDOW|PFD_SUPPORT_OPENGL|
-	    PFD_GENERIC_FORMAT,
-	    PFD_TYPE_RGBA,
-	    24,	
-	    8, 0,	
-	    8, 8,	
-	    8, 16,	
-	    8, 24,
-	    0, 0, 0, 0,	0,	
-	    16,	8,
-	    0, 0, 0,	
-	    0, 0, 0
-	},
-        GL_FALSE
-    },
-    /* Double Buffer, no alpha */
-    {	
-	{	
-	    sizeof(PIXELFORMATDESCRIPTOR),	1,
-	    PFD_DRAW_TO_WINDOW|PFD_SUPPORT_OPENGL|
-	    PFD_GENERIC_FORMAT|PFD_DOUBLEBUFFER|PFD_SWAP_COPY,
-	    PFD_TYPE_RGBA,
-	    24,	
-	    8, 0,
-	    8, 8,
-	    8, 16,
-	    0, 0,
-	    0, 0, 0, 0,	0,
-	    16,	8,
-	    0, 0, 0, 
-	    0, 0, 0 
-	},
-        GL_TRUE
-    },
-    /* Single Buffer, no alpha */
-    {	
-	{
-	    sizeof(PIXELFORMATDESCRIPTOR),	1,
-	    PFD_DRAW_TO_WINDOW|PFD_SUPPORT_OPENGL|
-	    PFD_GENERIC_FORMAT,
-	    PFD_TYPE_RGBA,
-	    24,	
-	    8, 0,
-	    8, 8,
-	    8, 16,
-	    0, 0,
-	    0, 0, 0, 0,	0,
-	    16,	8,
-	    0, 0, 0,
-	    0, 0, 0 
-	},
-        GL_FALSE
-    },
-};
-
-int npfd = sizeof(pfd) / sizeof(pfd[0]);
-
-
-/*
- * Contexts
- */
-
-typedef struct {
-    WMesaContext ctx;
-} MesaWglCtx;
-
-#define MESAWGL_CTX_MAX_COUNT 20
-
-static MesaWglCtx wgl_ctx[MESAWGL_CTX_MAX_COUNT];
-
-static unsigned ctx_count = 0;
-static int ctx_current = -1;
-static unsigned curPFD = 0;
-
-static HDC CurrentHDC = 0;
-
-
-WINGDIAPI HGLRC GLAPIENTRY wglCreateContext(HDC hdc)
-{
-    int i = 0;
-    if (!ctx_count) {
-	for(i=0;i<MESAWGL_CTX_MAX_COUNT;i++) {
-	    wgl_ctx[i].ctx = NULL;
-	}
-    }
-    for( i = 0; i < MESAWGL_CTX_MAX_COUNT; i++ ) {
-        if ( wgl_ctx[i].ctx == NULL ) {
-            wgl_ctx[i].ctx = 
-		WMesaCreateContext(hdc, NULL, (GLboolean)GL_TRUE,
-				   (GLboolean) (pfd[curPFD-1].doubleBuffered ?
-                                   GL_TRUE : GL_FALSE), 
-				   (GLboolean)(pfd[curPFD-1].pfd.cAlphaBits ? 
-				   GL_TRUE : GL_FALSE) );
-            if (wgl_ctx[i].ctx == NULL)
-                break;
-            ctx_count++;
-            return ((HGLRC)wgl_ctx[i].ctx);
-        }
-    }
-    SetLastError(0);
-    return(NULL);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglDeleteContext(HGLRC hglrc)
-{
-    int i;
-    for ( i = 0; i < MESAWGL_CTX_MAX_COUNT; i++ ) {
-    	if ( wgl_ctx[i].ctx == (WMesaContext) hglrc ){
-            WMesaMakeCurrent((WMesaContext) hglrc, NULL);
-            WMesaDestroyContext(wgl_ctx[i].ctx);
-            wgl_ctx[i].ctx = NULL;
-            ctx_count--;
-            return(TRUE);
-    	}
-    }
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI HGLRC GLAPIENTRY wglGetCurrentContext(VOID)
-{
-    if (ctx_current < 0)
-	return 0;
-    else
-	return (HGLRC) wgl_ctx[ctx_current].ctx;
-}
-
-WINGDIAPI HDC GLAPIENTRY wglGetCurrentDC(VOID)
-{
-    return CurrentHDC;
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglMakeCurrent(HDC hdc, HGLRC hglrc)
-{
-    int i;
-    
-    CurrentHDC = hdc;
-
-    if (!hdc || !hglrc) {
-	WMesaMakeCurrent(NULL, NULL);
-	ctx_current = -1;
-	return TRUE;
-    }
-    
-    for ( i = 0; i < MESAWGL_CTX_MAX_COUNT; i++ ) {
-	if ( wgl_ctx[i].ctx == (WMesaContext) hglrc ) {
-	    WMesaMakeCurrent( (WMesaContext) hglrc, hdc );
-	    ctx_current = i;
-	    return TRUE;
-	}
-    }
-    return FALSE;
-}
-
-
-WINGDIAPI int GLAPIENTRY wglChoosePixelFormat(HDC hdc,
-					      CONST 
-					      PIXELFORMATDESCRIPTOR *ppfd)
-{
-    int		i,best = -1,bestdelta = 0x7FFFFFFF,delta;
-    (void) hdc;
-    
-    if(ppfd->nSize != sizeof(PIXELFORMATDESCRIPTOR) || ppfd->nVersion != 1)
-	{
-	    SetLastError(0);
-	    return(0);
-	}
-    for(i = 0; i < npfd;i++)
-	{
-	    delta = 0;
-	    if(
-		(ppfd->dwFlags & PFD_DRAW_TO_WINDOW) &&
-		!(pfd[i].pfd.dwFlags & PFD_DRAW_TO_WINDOW))
-		continue;
-	    if(
-		(ppfd->dwFlags & PFD_DRAW_TO_BITMAP) &&
-		!(pfd[i].pfd.dwFlags & PFD_DRAW_TO_BITMAP))
-		continue;
-	    if(
-		(ppfd->dwFlags & PFD_SUPPORT_GDI) &&
-		!(pfd[i].pfd.dwFlags & PFD_SUPPORT_GDI))
-		continue;
-	    if(
-		(ppfd->dwFlags & PFD_SUPPORT_OPENGL) &&
-		!(pfd[i].pfd.dwFlags & PFD_SUPPORT_OPENGL))
-		continue;
-	    if(
-		!(ppfd->dwFlags & PFD_DOUBLEBUFFER_DONTCARE) &&
-		((ppfd->dwFlags & PFD_DOUBLEBUFFER) != 
-		 (pfd[i].pfd.dwFlags & PFD_DOUBLEBUFFER)))
-		continue;
-	    if(
-		!(ppfd->dwFlags & PFD_STEREO_DONTCARE) &&
-		((ppfd->dwFlags & PFD_STEREO) != 
-		 (pfd[i].pfd.dwFlags & PFD_STEREO)))
-		continue;
-	    if(ppfd->iPixelType != pfd[i].pfd.iPixelType)
-		delta++;
-	    if(ppfd->cAlphaBits != pfd[i].pfd.cAlphaBits)
-		delta++;
-	    if(delta < bestdelta)
-		{
-		    best = i + 1;
-		    bestdelta = delta;
-		    if(bestdelta == 0)
-			break;
-		}
-	}
-    if(best == -1)
-	{
-	    SetLastError(0);
-	    return(0);
-	}
-    return(best);
-}
-
-WINGDIAPI int GLAPIENTRY wglDescribePixelFormat(HDC hdc,
-					        int iPixelFormat,
-					        UINT nBytes,
-					        LPPIXELFORMATDESCRIPTOR ppfd)
-{
-    (void) hdc;
-    
-    if(ppfd == NULL)
-	return(npfd);
-    if(iPixelFormat < 1 || iPixelFormat > npfd || 
-       nBytes != sizeof(PIXELFORMATDESCRIPTOR))
-	{
-	    SetLastError(0);
-	    return(0);
-	}
-    *ppfd = pfd[iPixelFormat - 1].pfd;
-    return(npfd);
-}
-
-WINGDIAPI PROC GLAPIENTRY wglGetProcAddress(LPCSTR lpszProc)
-{
-    PROC p = (PROC) _glapi_get_proc_address((const char *) lpszProc);
-    if (p)
-	return p;
-    
-    SetLastError(0);
-    return(NULL);
-}
-
-WINGDIAPI int GLAPIENTRY wglGetPixelFormat(HDC hdc)
-{
-    (void) hdc;
-    if(curPFD == 0) {
-	SetLastError(0);
-	return(0);
-    }
-    return(curPFD);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglSetPixelFormat(HDC hdc,int iPixelFormat,
-					const PIXELFORMATDESCRIPTOR *ppfd)
-{
-    (void) hdc;
-    
-    if(iPixelFormat < 1 || iPixelFormat > npfd || 
-       ppfd->nSize != sizeof(PIXELFORMATDESCRIPTOR)) {
-	SetLastError(0);
-	return(FALSE);
-    }
-    curPFD = iPixelFormat;
-    return(TRUE);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglSwapBuffers(HDC hdc)
-{
-    WMesaSwapBuffers(hdc);
-    return TRUE;
-}
-
-static FIXED FixedFromDouble(double d)
-{
-   long l = (long) (d * 65536L);
-   return *(FIXED *) (void *) &l;
-}
-
-
-/*
-** This is cribbed from FX/fxwgl.c, and seems to implement support
-** for bitmap fonts where the wglUseFontBitmapsA() code implements
-** support for outline fonts.  In combination they hopefully give
-** fairly generic support for fonts.
-*/
-static BOOL wglUseFontBitmaps_FX(HDC fontDevice, DWORD firstChar,
-                                 DWORD numChars, DWORD listBase)
-{
-#define VERIFY(a) a
-    
-    TEXTMETRIC metric;
-    BITMAPINFO *dibInfo;
-    HDC bitDevice;
-    COLORREF tempColor;
-    int i;
-    
-    VERIFY(GetTextMetrics(fontDevice, &metric));
-    
-    dibInfo = (BITMAPINFO *) calloc(sizeof(BITMAPINFO) + sizeof(RGBQUAD), 1);
-    dibInfo->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
-    dibInfo->bmiHeader.biPlanes = 1;
-    dibInfo->bmiHeader.biBitCount = 1;
-    dibInfo->bmiHeader.biCompression = BI_RGB;
-    
-    bitDevice = CreateCompatibleDC(fontDevice);
-    
-    /* Swap fore and back colors so the bitmap has the right polarity */
-    tempColor = GetBkColor(bitDevice);
-    SetBkColor(bitDevice, GetTextColor(bitDevice));
-    SetTextColor(bitDevice, tempColor);
-    
-    /* Place chars based on base line */
-    VERIFY(SetTextAlign(bitDevice, TA_BASELINE) != GDI_ERROR ? 1 : 0);
-    
-    for(i = 0; i < (int)numChars; i++) {
-	SIZE size;
-	char curChar;
-	int charWidth,charHeight,bmapWidth,bmapHeight,numBytes,res;
-	HBITMAP bitObject;
-	HGDIOBJ origBmap;
-	unsigned char *bmap;
-	
-	curChar = (char)(i + firstChar);
-	
-	/* Find how high/wide this character is */
-	VERIFY(GetTextExtentPoint32(bitDevice, (LPCWSTR)&curChar, 1, &size));
-	
-	/* Create the output bitmap */
-	charWidth = size.cx;
-	charHeight = size.cy;
-	/* Round up to the next multiple of 32 bits */
-	bmapWidth = ((charWidth + 31) / 32) * 32;   
-	bmapHeight = charHeight;
-	bitObject = CreateCompatibleBitmap(bitDevice,
-					   bmapWidth,
-					   bmapHeight);
-	/* VERIFY(bitObject); */
-	
-	/* Assign the output bitmap to the device */
-	origBmap = SelectObject(bitDevice, bitObject);
-	(void) VERIFY(origBmap);
-	
-	VERIFY( PatBlt( bitDevice, 0, 0, bmapWidth, bmapHeight,BLACKNESS ) );
-	
-	/* Use our source font on the device */
-	VERIFY(SelectObject(bitDevice, GetCurrentObject(fontDevice,OBJ_FONT)));
-	
-	/* Draw the character */
-	VERIFY(TextOut(bitDevice, 0, metric.tmAscent, (LPCWSTR)&curChar, 1));
-	
-	/* Unselect our bmap object */
-	VERIFY(SelectObject(bitDevice, origBmap));
-	
-	/* Convert the display dependant representation to a 1 bit deep DIB */
-	numBytes = (bmapWidth * bmapHeight) / 8;
-	bmap = (unsigned char *)malloc(numBytes);
-	dibInfo->bmiHeader.biWidth = bmapWidth;
-	dibInfo->bmiHeader.biHeight = bmapHeight;
-	res = GetDIBits(bitDevice, bitObject, 0, bmapHeight, bmap,
-			dibInfo,
-			DIB_RGB_COLORS);
-	/* VERIFY(res); */
-	
-	/* Create the GL object */
-	glNewList(i + listBase, GL_COMPILE);
-	glBitmap(bmapWidth, bmapHeight, 0.0, (GLfloat)metric.tmDescent,
-		 (GLfloat)charWidth, 0.0,
-		 bmap);
-	glEndList();
-	/* CheckGL(); */
-	
-	/* Destroy the bmap object */
-	DeleteObject(bitObject);
-	
-	/* Deallocate the bitmap data */
-	free(bmap);
-    }
-    
-    /* Destroy the DC */
-    VERIFY(DeleteDC(bitDevice));
-    
-    free(dibInfo);
-    
-    return TRUE;
-#undef VERIFY
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglUseFontBitmapsA(HDC hdc, DWORD first,
-					     DWORD count, DWORD listBase)
-{
-    int i;
-    GLuint font_list;
-    DWORD size;
-    GLYPHMETRICS gm;
-    HANDLE hBits;
-    LPSTR lpBits;
-    MAT2 mat;
-    int  success = TRUE;
-    
-    if (count == 0)
-	return FALSE;
-    
-    font_list = listBase;
-    
-    mat.eM11 = FixedFromDouble(1);
-    mat.eM12 = FixedFromDouble(0);
-    mat.eM21 = FixedFromDouble(0);
-    mat.eM22 = FixedFromDouble(-1);
-    
-    memset(&gm,0,sizeof(gm));
-    
-    /*
-    ** If we can't get the glyph outline, it may be because this is a fixed
-    ** font.  Try processing it that way.
-    */
-    if( GetGlyphOutline(hdc, first, GGO_BITMAP, &gm, 0, NULL, &mat)
-	== GDI_ERROR ) {
-	return wglUseFontBitmaps_FX( hdc, first, count, listBase );
-    }
-    
-    /*
-    ** Otherwise process all desired characters.
-    */
-    for (i = 0; i < (int)count; i++) {
-	DWORD err;
-	
-	glNewList( font_list+i, GL_COMPILE );
-	
-	/* allocate space for the bitmap/outline */
-	size = GetGlyphOutline(hdc, first + i, GGO_BITMAP, 
-			       &gm, 0, NULL, &mat);
-	if (size == GDI_ERROR) {
-	    glEndList( );
-	    err = GetLastError();
-	    success = FALSE;
-	    continue;
-	}
-	
-	hBits  = GlobalAlloc(GHND, size+1);
-	lpBits = GlobalLock(hBits);
-	
-	err = 
-	    GetGlyphOutline(hdc,         /* handle to device context */
-			    first + i,   /* character to query */
-			    GGO_BITMAP,  /* format of data to return */
-			    &gm,         /* ptr to structure for metrics*/
-			    size,        /* size of buffer for data */
-			    lpBits,      /* pointer to buffer for data */
-			    &mat         /* pointer to transformation */
-			    /* matrix structure */
-		);
-	
-	if (err == GDI_ERROR) {
-	    GlobalUnlock(hBits);
-	    GlobalFree(hBits);
-	    
-	    glEndList( );
-	    err = GetLastError();
-	    success = FALSE;
-	    continue;
-	}
-	
-	glBitmap(gm.gmBlackBoxX,gm.gmBlackBoxY,
-		 (GLfloat)-gm.gmptGlyphOrigin.x,
-		 (GLfloat)gm.gmptGlyphOrigin.y,
-		 (GLfloat)gm.gmCellIncX,
-		 (GLfloat)gm.gmCellIncY,
-		 (const GLubyte * )lpBits);
-	
-	GlobalUnlock(hBits);
-	GlobalFree(hBits);
-	
-	glEndList( );
-    }
-    
-    return success;
-}
-
-
-
-/* NOT IMPLEMENTED YET */
-WINGDIAPI BOOL GLAPIENTRY wglCopyContext(HGLRC hglrcSrc,
-					 HGLRC hglrcDst,
-					 UINT mask)
-{
-    (void) hglrcSrc; (void) hglrcDst; (void) mask;
-    return(FALSE);
-}
-
-WINGDIAPI HGLRC GLAPIENTRY wglCreateLayerContext(HDC hdc,
-						 int iLayerPlane)
-{
-    (void) hdc; (void) iLayerPlane;
-    SetLastError(0);
-    return(NULL);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglShareLists(HGLRC hglrc1,
-					HGLRC hglrc2)
-{
-    (void) hglrc1; (void) hglrc2;
-    return(TRUE);
-}
-
-
-WINGDIAPI BOOL GLAPIENTRY wglUseFontBitmapsW(HDC hdc,
-					     DWORD first,
-					     DWORD count,
-					     DWORD listBase)
-{
-    (void) hdc; (void) first; (void) count; (void) listBase;
-    return FALSE;
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglUseFontOutlinesA(HDC hdc,
-					      DWORD first,
-					      DWORD count,
-					      DWORD listBase,
-					      FLOAT deviation,
-					      FLOAT extrusion,
-					      int format,
-					      LPGLYPHMETRICSFLOAT lpgmf)
-{
-    (void) hdc; (void) first; (void) count;
-    (void) listBase; (void) deviation; (void) extrusion; (void) format;
-    (void) lpgmf;
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglUseFontOutlinesW(HDC hdc,
-					      DWORD first,
-					      DWORD count,
-					      DWORD listBase,
-					      FLOAT deviation,
-					      FLOAT extrusion,
-					      int format,
-					      LPGLYPHMETRICSFLOAT lpgmf)
-{
-    (void) hdc; (void) first; (void) count;
-    (void) listBase; (void) deviation; (void) extrusion; (void) format;
-    (void) lpgmf;
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglDescribeLayerPlane(HDC hdc,
-						int iPixelFormat,
-						int iLayerPlane,
-						UINT nBytes,
-						LPLAYERPLANEDESCRIPTOR plpd)
-{
-    (void) hdc; (void) iPixelFormat; (void) iLayerPlane; 
-    (void) nBytes; (void) plpd;
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI int GLAPIENTRY wglSetLayerPaletteEntries(HDC hdc,
-						   int iLayerPlane,
-						   int iStart,
-						   int cEntries,
-						   CONST COLORREF *pcr)
-{
-    (void) hdc; (void) iLayerPlane; (void) iStart; 
-    (void) cEntries; (void) pcr;
-    SetLastError(0);
-    return(0);
-}
-
-WINGDIAPI int GLAPIENTRY wglGetLayerPaletteEntries(HDC hdc,
-						   int iLayerPlane,
-						   int iStart,
-						   int cEntries,
-						   COLORREF *pcr)
-{
-    (void) hdc; (void) iLayerPlane; (void) iStart; (void) cEntries; (void) pcr;
-    SetLastError(0);
-    return(0);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglRealizeLayerPalette(HDC hdc,
-						 int iLayerPlane,
-						 BOOL bRealize)
-{
-    (void) hdc; (void) iLayerPlane; (void) bRealize;
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI BOOL GLAPIENTRY wglSwapLayerBuffers(HDC hdc,
-					      UINT fuPlanes)
-{
-    (void) hdc; (void) fuPlanes;
-    SetLastError(0);
-    return(FALSE);
-}
-
-WINGDIAPI const char * GLAPIENTRY wglGetExtensionsStringARB(HDC hdc)
-{
-    return "WGL_ARB_extensions_string";
-}
diff --git a/src/gallium/winsys/gdi/wmesa.c b/src/gallium/winsys/gdi/wmesa.c
deleted file mode 100644
index ed3dd2b9277..00000000000
--- a/src/gallium/winsys/gdi/wmesa.c
+++ /dev/null
@@ -1,823 +0,0 @@
-/*
- * Windows (Win32/Win64) device driver for Mesa
- *
- */
-
-#include "mtypes.h"
-#include <GL/wmesa.h>
-#include "wmesadef.h"
-
-#undef Elements
-
-#include "pipe/p_winsys.h"
-#include "pipe/p_format.h"
-#include "pipe/p_context.h"
-#include "pipe/p_inlines.h"
-#include "util/u_memory.h"
-#include "softpipe/sp_winsys.h"
-#include "glapi/glapi.h"
-#include "colors.h"
-
-extern GLvisual *
-_mesa_create_visual( GLboolean rgbFlag,
-                     GLboolean dbFlag,
-                     GLboolean stereoFlag,
-                     GLint redBits,
-                     GLint greenBits,
-                     GLint blueBits,
-                     GLint alphaBits,
-                     GLint indexBits,
-                     GLint depthBits,
-                     GLint stencilBits,
-                     GLint accumRedBits,
-                     GLint accumGreenBits,
-                     GLint accumBlueBits,
-                     GLint accumAlphaBits,
-                     GLint numSamples );
-
-/* linked list of our Framebuffers (windows) */
-WMesaFramebuffer FirstFramebuffer = NULL;
-
-struct wmesa_pipe_winsys
-{
-   struct pipe_winsys base;
-};
-
-/**
- * Choose the pixel format for the given visual.
- * This will tell the gallium driver how to pack pixel data into
- * drawing surfaces.
- */
-static GLuint
-choose_pixel_format(GLvisual *v)
-{
-#if 1
-   return PIPE_FORMAT_A8R8G8B8_UNORM;
-#else
-   if (   GET_REDMASK(v)   == 0x0000ff
-       && GET_GREENMASK(v) == 0x00ff00
-       && GET_BLUEMASK(v)  == 0xff0000
-       && v->BitsPerPixel == 32) {
-      if (CHECK_BYTE_ORDER(v)) {
-         /* no byteswapping needed */
-         return 0 /* PIXEL_FORMAT_U_A8_B8_G8_R8 */;
-      }
-      else {
-         return PIPE_FORMAT_R8G8B8A8_UNORM;
-      }
-   }
-   else if (   GET_REDMASK(v)   == 0xff0000
-            && GET_GREENMASK(v) == 0x00ff00
-            && GET_BLUEMASK(v)  == 0x0000ff
-            && v->BitsPerPixel == 32) {
-      if (CHECK_BYTE_ORDER(v)) {
-         /* no byteswapping needed */
-         return PIPE_FORMAT_A8R8G8B8_UNORM;
-      }
-      else {
-         return PIPE_FORMAT_B8G8R8A8_UNORM;
-      }
-   }
-   else if (   GET_REDMASK(v)   == 0xf800
-            && GET_GREENMASK(v) == 0x07e0
-            && GET_BLUEMASK(v)  == 0x001f
-            && CHECK_BYTE_ORDER(v)
-            && v->BitsPerPixel == 16) {
-      /* 5-6-5 RGB */
-      return PIPE_FORMAT_R5G6B5_UNORM;
-   }
-
-printf("BITS %d\n",v->BitsPerPixel);
-   assert(0);
-   return 0;
-#endif
-}
-
-/*
- * Determine the pixel format based on the pixel size.
- */
-static void wmSetPixelFormat(WMesaFramebuffer pwfb, HDC hDC)
-{
-    /* Only 16 and 32 bit targets are supported now */
-    assert(pwfb->cColorBits == 0 ||
-	   pwfb->cColorBits == 16 || 
-	   pwfb->cColorBits == 32);
-
-    switch(pwfb->cColorBits){
-    case 8:
-	pwfb->pixelformat = PF_INDEX8;
-	break;
-    case 16:
-	pwfb->pixelformat = PF_5R6G5B;
-	break;
-    case 32:
-	pwfb->pixelformat = PF_8R8G8B;
-	break;
-    default:
-	pwfb->pixelformat = PF_BADFORMAT;
-    }
-}
-
-/**
- * Create a new WMesaFramebuffer object which will correspond to the
- * given HDC (Window handle).
- */
-WMesaFramebuffer
-wmesa_new_framebuffer(HDC hdc, GLvisual *visual, GLuint width, GLuint height)
-{
-    WMesaFramebuffer pwfb
-        = (WMesaFramebuffer) malloc(sizeof(struct wmesa_framebuffer));
-    if (pwfb) {
-   	enum pipe_format colorFormat, depthFormat, stencilFormat;
-
-   	/* determine PIPE_FORMATs for buffers */
-   	colorFormat = choose_pixel_format(visual);
-
-   	if (visual->depthBits == 0)
-      		depthFormat = PIPE_FORMAT_NONE;
-	else if (visual->depthBits <= 16)
-      		depthFormat = PIPE_FORMAT_Z16_UNORM;
-	else if (visual->depthBits <= 24)
-      		depthFormat = PIPE_FORMAT_S8Z24_UNORM;
-	else
-      		depthFormat = PIPE_FORMAT_Z32_UNORM;
-
-	if (visual->stencilBits == 8) {
-      		if (depthFormat == PIPE_FORMAT_S8Z24_UNORM)
-			stencilFormat = depthFormat;
-		else
-			stencilFormat = PIPE_FORMAT_S8_UNORM;
-   	}
-   	else {
-      		stencilFormat = PIPE_FORMAT_NONE;
-   	}
-
-   	pwfb->stfb = st_create_framebuffer(visual,
-                                   colorFormat, depthFormat, stencilFormat,
-                                   width, height,
-                                   (void *) pwfb);
-
-        pwfb->cColorBits = GetDeviceCaps(hdc, BITSPIXEL);
-
-        pwfb->hDC = hdc;
-        /* insert at head of list */
-	pwfb->next = FirstFramebuffer;
-        FirstFramebuffer = pwfb;
-    }
-    return pwfb;
-}
-
-/**
- * Given an hdc, free the corresponding WMesaFramebuffer
- */
-void
-wmesa_free_framebuffer(HDC hdc)
-{
-    WMesaFramebuffer pwfb, prev;
-    for (pwfb = FirstFramebuffer; pwfb; pwfb = pwfb->next) {
-        if (pwfb->hDC == hdc)
-            break;
-	prev = pwfb;
-    }
-    if (pwfb) {
-	if (pwfb == FirstFramebuffer)
-	    FirstFramebuffer = pwfb->next;
-	else
-	    prev->next = pwfb->next;
-	free(pwfb);
-    }
-}
-
-/**
- * Given an hdc, return the corresponding WMesaFramebuffer
- */
-WMesaFramebuffer
-wmesa_lookup_framebuffer(HDC hdc)
-{
-    WMesaFramebuffer pwfb;
-    for (pwfb = FirstFramebuffer; pwfb; pwfb = pwfb->next) {
-        if (pwfb->hDC == hdc)
-            return pwfb;
-    }
-    return NULL;
-}
-
-
-/**
- * Given a GLframebuffer, return the corresponding WMesaFramebuffer.
- */
-static WMesaFramebuffer wmesa_framebuffer(GLframebuffer *fb)
-{
-    return (WMesaFramebuffer) fb;
-}
-
-
-/**
- * Given a GLcontext, return the corresponding WMesaContext.
- */
-static WMesaContext wmesa_context(const GLcontext *ctx)
-{
-    return (WMesaContext) ctx;
-}
-
-/**
- * Find the width and height of the window named by hdc.
- */
-static void
-get_window_size(HDC hdc, GLuint *width, GLuint *height)
-{
-    if (WindowFromDC(hdc)) {
-        RECT rect;
-        GetClientRect(WindowFromDC(hdc), &rect);
-        *width = rect.right - rect.left;
-        *height = rect.bottom - rect.top;
-    }
-    else { /* Memory context */
-        /* From contributed code - use the size of the desktop
-         * for the size of a memory context (?) */
-        *width = GetDeviceCaps(hdc, HORZRES);
-        *height = GetDeviceCaps(hdc, VERTRES);
-    }
-}
-
-/**
- * Low-level OS/window system memory buffer
- */
-struct wm_buffer
-{
-   struct pipe_buffer base;
-   boolean userBuffer;  /** Is this a user-space buffer? */
-   void *data;
-   void *mapped;
-};
-
-struct wmesa_surface
-{
-   struct pipe_surface surface;
-
-   int no_swap;
-};
-
-
-/** Cast wrapper */
-static INLINE struct wmesa_surface *
-wmesa_surface(struct pipe_surface *ps)
-{
-//   assert(0);
-   return (struct wmesa_surface *) ps;
-}
-
-/**
- * Turn the softpipe opaque buffer pointer into a dri_bufmgr opaque
- * buffer pointer...
- */
-static INLINE struct wm_buffer *
-wm_buffer( struct pipe_buffer *buf )
-{
-   return (struct wm_buffer *)buf;
-}
-
-
-
-/* Most callbacks map direcly onto dri_bufmgr operations:
- */
-static void *
-wm_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buf,
-              unsigned flags)
-{
-   struct wm_buffer *wm_buf = wm_buffer(buf);
-   wm_buf->mapped = wm_buf->data;
-   return wm_buf->mapped;
-}
-
-static void
-wm_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf)
-{
-   struct wm_buffer *wm_buf = wm_buffer(buf);
-   wm_buf->mapped = NULL;
-}
-
-static void
-wm_buffer_destroy(struct pipe_winsys *pws,
-                  struct pipe_buffer *buf)
-{
-   struct wm_buffer *oldBuf = wm_buffer(buf);
-
-   if (oldBuf->data) {
-      {
-         if (!oldBuf->userBuffer) {
-            align_free(oldBuf->data);
-         }
-      }
-
-      oldBuf->data = NULL;
-   }
-
-   free(oldBuf);
-}
-
-
-static void
-wm_flush_frontbuffer(struct pipe_winsys *pws,
-                     struct pipe_surface *surf,
-                     void *context_private)
-{
-    WMesaContext pwc = context_private;
-    WMesaFramebuffer pwfb = wmesa_lookup_framebuffer(pwc->hDC);
-   struct wm_buffer *wm_buf;
-    BITMAPINFO bmi, *pbmi;
-
-    wm_buf = wm_buffer(surf->buffer);
-
-    pbmi = &bmi;
-    memset(pbmi, 0, sizeof(BITMAPINFO));
-    pbmi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
-    pbmi->bmiHeader.biWidth = pwfb->stfb->Base.Width;
-    pbmi->bmiHeader.biHeight= -((long)pwfb->stfb->Base.Height);
-    pbmi->bmiHeader.biPlanes = 1;
-    pbmi->bmiHeader.biBitCount = pwfb->cColorBits;
-    pbmi->bmiHeader.biCompression = BI_RGB;
-    pbmi->bmiHeader.biSizeImage = 0;
-    pbmi->bmiHeader.biXPelsPerMeter = 0;
-    pbmi->bmiHeader.biYPelsPerMeter = 0;
-    pbmi->bmiHeader.biClrUsed = 0;
-    pbmi->bmiHeader.biClrImportant = 0;
-
-    StretchDIBits(pwfb->hDC, 0, 0, pwfb->stfb->Base.Width, pwfb->stfb->Base.Height, 0, 0, pwfb->stfb->Base.Width, pwfb->stfb->Base.Height, wm_buf->data, pbmi, 0, SRCCOPY);
-}
-
-
-
-static const char *
-wm_get_name(struct pipe_winsys *pws)
-{
-   return "gdi";
-}
-
-static struct pipe_buffer *
-wm_buffer_create(struct pipe_winsys *pws, 
-                 unsigned alignment, 
-                 unsigned usage,
-                 unsigned size)
-{
-   struct wm_buffer *buffer = CALLOC_STRUCT(wm_buffer);
-
-   buffer->base.refcount = 1;
-   buffer->base.alignment = alignment;
-   buffer->base.usage = usage;
-   buffer->base.size = size;
-
-   if (buffer->data == NULL) {
-      /* align to 16-byte multiple for Cell */
-      buffer->data = align_malloc(size, max(alignment, 16));
-   }
-
-   return &buffer->base;
-}
-
-
-/**
- * Create buffer which wraps user-space data.
- */
-static struct pipe_buffer *
-wm_user_buffer_create(struct pipe_winsys *pws, void *ptr, unsigned bytes)
-{
-   struct wm_buffer *buffer = CALLOC_STRUCT(wm_buffer);
-   buffer->base.refcount = 1;
-   buffer->base.size = bytes;
-   buffer->userBuffer = TRUE;
-   buffer->data = ptr;
-
-   return &buffer->base;
-}
-
-
-
-/**
- * Round n up to next multiple.
- */
-static INLINE unsigned
-round_up(unsigned n, unsigned multiple)
-{
-   return (n + multiple - 1) & ~(multiple - 1);
-}
-
-static int
-wm_surface_alloc_storage(struct pipe_winsys *winsys,
-                         struct pipe_surface *surf,
-                         unsigned width, unsigned height,
-                         enum pipe_format format, 
-                         unsigned flags,
-                         unsigned tex_usage)
-{
-   const unsigned alignment = 64;
-
-   surf->width = width;
-   surf->height = height;
-   surf->format = format;
-   pf_get_block(format, &surf->block);
-   surf->nblocksx = pf_get_nblocksx(&surf->block, width);
-   surf->nblocksy = pf_get_nblocksy(&surf->block, height);
-   surf->stride = round_up(surf->nblocksx * surf->block.size, alignment);
-
-   assert(!surf->buffer);
-   surf->buffer = winsys->buffer_create(winsys, alignment,
-                                        PIPE_BUFFER_USAGE_PIXEL,
-                                        surf->nblocksy * surf->stride);
-   if(!surf->buffer)
-      return -1;
-   
-   return 0;
-}
-
-
-/**
- * Called via winsys->surface_alloc() to create new surfaces.
- */
-static struct pipe_surface *
-wm_surface_alloc(struct pipe_winsys *ws)
-{
-   struct wmesa_surface *wms = CALLOC_STRUCT(wmesa_surface);
-   static boolean no_swap = 0;
-   static boolean firsttime = 1;
-
-   if (firsttime) {
-      no_swap = getenv("SP_NO_RAST") != NULL;
-      firsttime = 0;
-   }
-
-   assert(ws);
-
-   wms->surface.refcount = 1;
-   wms->surface.winsys = ws;
-
-   wms->no_swap = no_swap;
-   
-   return &wms->surface;
-}
-
-static void
-wm_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
-{
-   struct pipe_surface *surf = *s;
-   surf->refcount--;
-   if (surf->refcount == 0) {
-      if (surf->buffer)
-	winsys_buffer_reference(winsys, &surf->buffer, NULL);
-      free(surf);
-   }
-   *s = NULL;
-}
-
-
-/*
- * Fence functions - basically nothing to do, as we don't create any actual
- * fence objects.
- */
-
-static void
-wm_fence_reference(struct pipe_winsys *sws, struct pipe_fence_handle **ptr,
-                   struct pipe_fence_handle *fence)
-{
-}
-
-
-static int
-wm_fence_signalled(struct pipe_winsys *sws, struct pipe_fence_handle *fence,
-                   unsigned flag)
-{
-   return 0;
-}
-
-
-static int
-wm_fence_finish(struct pipe_winsys *sws, struct pipe_fence_handle *fence,
-                unsigned flag)
-{
-   return 0;
-}
-
-
-
-struct pipe_winsys *
-wmesa_get_pipe_winsys(GLvisual *visual)
-{
-   static struct wmesa_pipe_winsys *ws = NULL;
-
-   if (!ws) {
-      ws = CALLOC_STRUCT(wmesa_pipe_winsys);
-
-      /* Fill in this struct with callbacks that pipe will need to
-       * communicate with the window system, buffer manager, etc. 
-       */
-      ws->base.buffer_create = wm_buffer_create;
-      ws->base.user_buffer_create = wm_user_buffer_create;
-      ws->base.buffer_map = wm_buffer_map;
-      ws->base.buffer_unmap = wm_buffer_unmap;
-      ws->base.buffer_destroy = wm_buffer_destroy;
-
-      ws->base.surface_alloc = wm_surface_alloc;
-      ws->base.surface_alloc_storage = wm_surface_alloc_storage;
-      ws->base.surface_release = wm_surface_release;
-
-      ws->base.fence_reference = wm_fence_reference;
-      ws->base.fence_signalled = wm_fence_signalled;
-      ws->base.fence_finish = wm_fence_finish;
-
-      ws->base.flush_frontbuffer = wm_flush_frontbuffer;
-      ws->base.get_name = wm_get_name;
-   }
-
-   return &ws->base;
-}
-
-
-
-/**********************************************************************/
-/*****                   WMESA Functions                          *****/
-/**********************************************************************/
-
-WMesaContext WMesaCreateContext(HDC hDC, 
-				HPALETTE* Pal,
-				GLboolean rgb_flag,
-				GLboolean db_flag,
-				GLboolean alpha_flag)
-{
-   WMesaContext c;
-   struct pipe_winsys *pws;
-   struct pipe_context *pipe;
-   struct pipe_screen *screen;
-   GLint red_bits, green_bits, blue_bits, alpha_bits;
-   GLvisual *visual;
-
-   (void) Pal;
-    
-   /* Indexed mode not supported */
-   if (!rgb_flag)
-	return NULL;
-
-   /* Allocate wmesa context */
-   c = CALLOC_STRUCT(wmesa_context);
-   if (!c)
-	return NULL;
-
-   c->hDC = hDC;
-
-    /* Get data for visual */
-    /* Dealing with this is actually a bit of overkill because Mesa will end
-     * up treating all color component size requests less than 8 by using 
-     * a single byte per channel.  In addition, the interface to the span
-     * routines passes colors as an entire byte per channel anyway, so there
-     * is nothing to be saved by telling the visual to be 16 bits if the device
-     * is 16 bits.  That is, Mesa is going to compute colors down to 8 bits per
-     * channel anyway.
-     * But we go through the motions here anyway.
-     */
-    c->cColorBits = GetDeviceCaps(c->hDC, BITSPIXEL);
-
-    switch (c->cColorBits) {
-    case 16:
-	red_bits = green_bits = blue_bits = 5;
-	alpha_bits = 0;
-	break;
-    default:
-	red_bits = green_bits = blue_bits = 8;
-	alpha_bits = 8;
-	break;
-    }
-    /* Create visual based on flags */
-    visual = _mesa_create_visual(rgb_flag,
-                                 db_flag,    /* db_flag */
-                                 GL_FALSE,   /* stereo */
-                                 red_bits, green_bits, blue_bits, /* color RGB */
-                                 alpha_flag ? alpha_bits : 0, /* color A */
-                                 0,          /* index bits */
-                                 DEFAULT_SOFTWARE_DEPTH_BITS, /* depth_bits */
-                                 8,          /* stencil_bits */
-                                 16,16,16,   /* accum RGB */
-                                 alpha_flag ? 16 : 0, /* accum A */
-                                 1);         /* num samples */
-    
-    if (!visual) {
-	_mesa_free(c);
-	return NULL;
-    }
-
-    pws = wmesa_get_pipe_winsys(visual);
-
-    screen = softpipe_create_screen(pws);
-
-    if (!screen) {
-        _mesa_free(c);
-	return NULL;
-   }
-
-   pipe = softpipe_create(screen, pws, NULL);
-
-   if (!pipe) {
-      /* FIXME - free screen */
-      _mesa_free(c);
-      return NULL;
-   }
-
-   pipe->priv = c;
-
-   c->st = st_create_context(pipe, visual, NULL);
-
-   c->st->ctx->DriverCtx = c;
-
-   return c;
-}
-
-
-void WMesaDestroyContext( WMesaContext pwc )
-{
-    GLcontext *ctx = pwc->st->ctx;
-    WMesaFramebuffer pwfb;
-    GET_CURRENT_CONTEXT(cur_ctx);
-
-    if (cur_ctx == ctx) {
-        /* unbind current if deleting current context */
-        WMesaMakeCurrent(NULL, NULL);
-    }
-
-    /* clean up frame buffer resources */
-    pwfb = wmesa_lookup_framebuffer(pwc->hDC);
-    if (pwfb) {
-	wmesa_free_framebuffer(pwc->hDC);
-    }
-
-    /* Release for device, not memory contexts */
-    if (WindowFromDC(pwc->hDC) != NULL)
-    {
-      ReleaseDC(WindowFromDC(pwc->hDC), pwc->hDC);
-    }
-    
-    st_destroy_context(pwc->st);
-    _mesa_free(pwc);
-}
-
-
-void WMesaMakeCurrent(WMesaContext c, HDC hdc)
-{
-    GLuint width = 0, height = 0;
-    WMesaFramebuffer pwfb;
-
-    {
-        /* return if already current */
-        GET_CURRENT_CONTEXT(ctx);
-        WMesaContext pwc = wmesa_context(ctx);
-        if (pwc && c == pwc && pwc->hDC == hdc)
-            return;
-    }
-
-    pwfb = wmesa_lookup_framebuffer(hdc);
-
-    if (hdc) {
-        get_window_size(hdc, &width, &height);
-    }
-
-    /* Lazy creation of framebuffers */
-    if (c && !pwfb && (hdc != 0)) {
-        GLvisual *visual = &c->st->ctx->Visual;
-
-        pwfb = wmesa_new_framebuffer(hdc, visual, width, height);
-    }
-
-    if (c && pwfb) {
-      	st_make_current(c->st, pwfb->stfb, pwfb->stfb);
-
-	st_resize_framebuffer(pwfb->stfb, width, height);
-   }
-   else {
-      /* Detach */
-      st_make_current( NULL, NULL, NULL );
-   }
-}
-
-
-void WMesaSwapBuffers( HDC hdc )
-{
-   struct pipe_surface *surf;
-   struct wm_buffer *wm_buf;
-    WMesaFramebuffer pwfb = wmesa_lookup_framebuffer(hdc);
-    BITMAPINFO bmi, *pbmi;
-
-    if (!pwfb) {
-        _mesa_problem(NULL, "wmesa: swapbuffers on unknown hdc");
-        return;
-    }
-
-
-    /* If we're swapping the buffer associated with the current context
-     * we have to flush any pending rendering commands first.
-     */
-    st_notify_swapbuffers(pwfb->stfb);
-
-    surf = st_get_framebuffer_surface(pwfb->stfb, ST_SURFACE_BACK_LEFT);
-    wm_buf = wm_buffer(surf->buffer);
-
-    pbmi = &bmi;
-    memset(pbmi, 0, sizeof(BITMAPINFO));
-    pbmi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
-    pbmi->bmiHeader.biWidth = pwfb->stfb->Base.Width;
-    pbmi->bmiHeader.biHeight= -((long)pwfb->stfb->Base.Height);
-    pbmi->bmiHeader.biPlanes = 1;
-    pbmi->bmiHeader.biBitCount = pwfb->cColorBits;
-    pbmi->bmiHeader.biCompression = BI_RGB;
-    pbmi->bmiHeader.biSizeImage = 0;
-    pbmi->bmiHeader.biXPelsPerMeter = 0;
-    pbmi->bmiHeader.biYPelsPerMeter = 0;
-    pbmi->bmiHeader.biClrUsed = 0;
-    pbmi->bmiHeader.biClrImportant = 0;
-
-    StretchDIBits(pwfb->hDC, 0, 0, pwfb->stfb->Base.Width, pwfb->stfb->Base.Height, 0, 0, pwfb->stfb->Base.Width, pwfb->stfb->Base.Height, wm_buf->data, pbmi, 0, SRCCOPY);
-
-    {
-        GLuint width = 0, height = 0;
-
-        get_window_size(pwfb->hDC, &width, &height);
-
-	st_resize_framebuffer(pwfb->stfb, width, height);
-    }
-}
-
-/* This is hopefully a temporary hack to define some needed dispatch
- * table entries.  Hopefully, I'll find a better solution.  The
- * dispatch table generation scripts ought to be making these dummy
- * stubs as well. */
-#if !defined(__MINGW32__) || !defined(GL_NO_STDCALL)
-void gl_dispatch_stub_543(void){}
-void gl_dispatch_stub_544(void){}
-void gl_dispatch_stub_545(void){}
-void gl_dispatch_stub_546(void){}
-void gl_dispatch_stub_547(void){}
-void gl_dispatch_stub_548(void){}
-void gl_dispatch_stub_549(void){}
-void gl_dispatch_stub_550(void){}
-void gl_dispatch_stub_551(void){}
-void gl_dispatch_stub_552(void){}
-void gl_dispatch_stub_553(void){}
-void gl_dispatch_stub_554(void){}
-void gl_dispatch_stub_555(void){}
-void gl_dispatch_stub_556(void){}
-void gl_dispatch_stub_557(void){}
-void gl_dispatch_stub_558(void){}
-void gl_dispatch_stub_559(void){}
-void gl_dispatch_stub_560(void){}
-void gl_dispatch_stub_561(void){}
-void gl_dispatch_stub_565(void){}
-void gl_dispatch_stub_566(void){}
-void gl_dispatch_stub_577(void){}
-void gl_dispatch_stub_578(void){}
-void gl_dispatch_stub_603(void){}
-void gl_dispatch_stub_645(void){}
-void gl_dispatch_stub_646(void){}
-void gl_dispatch_stub_647(void){}
-void gl_dispatch_stub_648(void){}
-void gl_dispatch_stub_649(void){}
-void gl_dispatch_stub_650(void){}
-void gl_dispatch_stub_651(void){}
-void gl_dispatch_stub_652(void){}
-void gl_dispatch_stub_653(void){}
-void gl_dispatch_stub_733(void){}
-void gl_dispatch_stub_734(void){}
-void gl_dispatch_stub_735(void){}
-void gl_dispatch_stub_736(void){}
-void gl_dispatch_stub_737(void){}
-void gl_dispatch_stub_738(void){}
-void gl_dispatch_stub_744(void){}
-void gl_dispatch_stub_745(void){}
-void gl_dispatch_stub_746(void){}
-void gl_dispatch_stub_760(void){}
-void gl_dispatch_stub_761(void){}
-void gl_dispatch_stub_763(void){}
-void gl_dispatch_stub_765(void){}
-void gl_dispatch_stub_766(void){}
-void gl_dispatch_stub_767(void){}
-void gl_dispatch_stub_768(void){}
-
-void gl_dispatch_stub_562(void){}
-void gl_dispatch_stub_563(void){}
-void gl_dispatch_stub_564(void){}
-void gl_dispatch_stub_567(void){}
-void gl_dispatch_stub_568(void){}
-void gl_dispatch_stub_569(void){}
-void gl_dispatch_stub_580(void){}
-void gl_dispatch_stub_581(void){}
-void gl_dispatch_stub_606(void){}
-void gl_dispatch_stub_654(void){}
-void gl_dispatch_stub_655(void){}
-void gl_dispatch_stub_656(void){}
-void gl_dispatch_stub_739(void){}
-void gl_dispatch_stub_740(void){}
-void gl_dispatch_stub_741(void){}
-void gl_dispatch_stub_748(void){}
-void gl_dispatch_stub_749(void){}
-void gl_dispatch_stub_769(void){}
-void gl_dispatch_stub_770(void){}
-void gl_dispatch_stub_771(void){}
-void gl_dispatch_stub_772(void){}
-void gl_dispatch_stub_773(void){}
-
-#endif
diff --git a/src/gallium/winsys/gdi/wmesadef.h b/src/gallium/winsys/gdi/wmesadef.h
deleted file mode 100644
index fb8ce30a082..00000000000
--- a/src/gallium/winsys/gdi/wmesadef.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef WMESADEF_H
-#define WMESADEF_H
-#ifdef __MINGW32__
-#include <windows.h>
-#endif
-#if 0
-#include "context.h"
-#endif
-#include "state_tracker/st_context.h"
-#include "state_tracker/st_public.h"
-
-
-/**
- * The Windows Mesa rendering context, derived from GLcontext.
- */
-struct wmesa_context {
-    struct st_context 	*st;
-    HDC                 hDC;
-    BYTE		cColorBits;
-};
-
-/**
- * Windows framebuffer, derived from gl_framebuffer
- */
-struct wmesa_framebuffer
-{
-    struct st_framebuffer *stfb;
-    HDC                 hDC;
-    int			pixelformat;
-    BYTE		cColorBits;
-    HDC                 dib_hDC;
-    HBITMAP             hbmDIB;
-    HBITMAP             hOldBitmap;
-    PBYTE               pbPixels;
-    struct wmesa_framebuffer *next;
-};
-
-typedef struct wmesa_framebuffer *WMesaFramebuffer;
-
-#endif /* WMESADEF_H */
diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript
index 324fbef306a..3aef3b6cedd 100644
--- a/src/gallium/winsys/xlib/SConscript
+++ b/src/gallium/winsys/xlib/SConscript
@@ -5,8 +5,7 @@ Import('*')
 
 if env['platform'] == 'linux' \
         and 'mesa' in env['statetrackers'] \
-        and 'softpipe' in env['drivers'] \
-        and 'i965simple' in env['drivers'] \
+        and ('softpipe' or 'i915simple' or 'trace') in env['drivers'] \
         and not env['dri']:
 
     env = env.Clone()
@@ -22,15 +21,20 @@ if env['platform'] == 'linux' \
         'xfonts.c',
         'xm_api.c',
         'xm_winsys.c',
-        'xm_winsys_aub.c',
-        'brw_aub.c',
     ]
+
+    drivers = [];
+        
+    if 'softpipe' in env['drivers']:
+        drivers += [softpipe]
+
+    if 'i965simple' in env['drivers']:
+        drivers += [i965simple]
+        sources += [
+            'brw_aub.c',
+            'xm_winsys_aub.c',
+            ]
         
-    drivers = [
-        softpipe,
-        i965simple,
-    ]
-    
     if 'trace' in env['drivers']:
         env.Append(CPPDEFINES = 'GALLIUM_TRACE')
         drivers += [trace]
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 3334af175bc..acb5ad8f714 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -352,7 +352,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
          /* offset in pixels */
          offset *= TILE_SIZE * TILE_SIZE;
 
-         if (XSHM_ENABLED(xm_buf)) {
+         if (0 && XSHM_ENABLED(xm_buf)) {
             ximage->data = (char *) xm_buf->data + 4 * offset;
             /* make copy of tile data */
             memcpy(tmpTile, (uint *) ximage->data, sizeof(tmpTile));
@@ -365,7 +365,7 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 #endif
          }
          else {
-            /* twiddel from ximage buffer to temp tile */
+            /* twiddle from ximage buffer to temp tile */
             twiddle_tile((uint *) xm_buf->data + offset, tmpTile);
             /* display temp tile data */
             ximage->data = (char *) tmpTile;