From 2f5f7c07732577f60666e3cee69c75c9b035c145 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 23 Oct 2009 16:55:02 +0100
Subject: i965g: re-starting from the dri driver

---
 src/gallium/drivers/i965/brw_vs.h | 88 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 src/gallium/drivers/i965/brw_vs.h

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
new file mode 100644
index 00000000000..4a591365c98
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -0,0 +1,88 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+            
+
+#ifndef BRW_VS_H
+#define BRW_VS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "shader/program.h"
+
+
+struct brw_vs_prog_key {
+   GLuint program_string_id;
+   GLuint nr_userclip:4;
+   GLuint copy_edgeflag:1;
+   GLuint pad:26;
+};
+
+
+struct brw_vs_compile {
+   struct brw_compile func;
+   struct brw_vs_prog_key key;
+   struct brw_vs_prog_data prog_data;
+
+   struct brw_vertex_program *vp;
+
+   GLuint nr_inputs;
+
+   GLuint first_output;
+   GLuint nr_outputs;
+   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
+
+   GLuint first_tmp;
+   GLuint last_tmp;
+
+   struct brw_reg r0;
+   struct brw_reg r1;
+   struct brw_reg regs[PROGRAM_ADDRESS+1][128];
+   struct brw_reg tmp;
+   struct brw_reg stack;
+
+   struct {	
+       GLboolean used_in_src;
+       struct brw_reg reg;
+   } output_regs[128];
+
+   struct brw_reg userplane[6];
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
+};
+
+void brw_vs_emit( struct brw_vs_compile *c );
+
+#endif
-- 
cgit v1.2.3


From 074606a806df755ecbb84e0a1182c66fd0b2a8dd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 24 Oct 2009 13:18:34 +0100
Subject: i965g: more files compiling

---
 src/gallium/drivers/i965/brw_batchbuffer.h      | 124 ++++++++++++
 src/gallium/drivers/i965/brw_cc.c               |  16 +-
 src/gallium/drivers/i965/brw_clip.c             |  80 +++-----
 src/gallium/drivers/i965/brw_clip.h             |   7 +-
 src/gallium/drivers/i965/brw_clip_unfilled.c    |   2 +-
 src/gallium/drivers/i965/brw_clip_util.c        |   2 +-
 src/gallium/drivers/i965/brw_context.c          |   2 +-
 src/gallium/drivers/i965/brw_context.h          |  89 ++++-----
 src/gallium/drivers/i965/brw_curbe.c            |  10 +-
 src/gallium/drivers/i965/brw_defines.h          |   4 +-
 src/gallium/drivers/i965/brw_draw.c             |  12 +-
 src/gallium/drivers/i965/brw_draw_upload.c      |   2 +-
 src/gallium/drivers/i965/brw_eu.h               |  32 +++-
 src/gallium/drivers/i965/brw_eu_emit.c          |   4 +-
 src/gallium/drivers/i965/brw_gs.c               |   2 +-
 src/gallium/drivers/i965/brw_gs_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_misc_state.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   2 +-
 src/gallium/drivers/i965/brw_pipe_query.c       |   4 +-
 src/gallium/drivers/i965/brw_pipe_rast.c        |  46 +++++
 src/gallium/drivers/i965/brw_pipe_rast.h        |  14 ++
 src/gallium/drivers/i965/brw_pipe_shader.c      | 159 ++++++++++++++++
 src/gallium/drivers/i965/brw_reg.h              |  79 ++++++++
 src/gallium/drivers/i965/brw_screen.h           |  78 ++++++++
 src/gallium/drivers/i965/brw_screen_surface.c   |   4 +-
 src/gallium/drivers/i965/brw_sf.c               |   2 +-
 src/gallium/drivers/i965/brw_sf.h               |   1 -
 src/gallium/drivers/i965/brw_sf_emit.c          |   2 +-
 src/gallium/drivers/i965/brw_state.h            |   2 +-
 src/gallium/drivers/i965/brw_state_batch.c      |   6 +-
 src/gallium/drivers/i965/brw_state_cache.c      |   2 +-
 src/gallium/drivers/i965/brw_state_upload.c     |   2 +-
 src/gallium/drivers/i965/brw_tex_layout.c       |   2 +-
 src/gallium/drivers/i965/brw_urb.c              |   2 +-
 src/gallium/drivers/i965/brw_util.h             |   5 +-
 src/gallium/drivers/i965/brw_vs.c               |   3 +-
 src/gallium/drivers/i965/brw_vs.h               |   1 -
 src/gallium/drivers/i965/brw_vs_emit.c          |  82 ++++----
 src/gallium/drivers/i965/brw_winsys.h           | 243 ++++++++++++++++++++++++
 src/gallium/drivers/i965/brw_wm.h               |   1 -
 src/gallium/drivers/i965/brw_wm_debug.c         |   2 +-
 src/gallium/drivers/i965/brw_wm_emit.c          |  84 ++++----
 src/gallium/drivers/i965/brw_wm_fp.c            |  60 +++---
 src/gallium/drivers/i965/brw_wm_pass0.c         |   1 -
 src/gallium/drivers/i965/brw_wm_pass1.c         |  68 +++----
 src/gallium/drivers/i965/brw_wm_surface_state.c |   2 +-
 src/gallium/drivers/i965/intel_batchbuffer.h    | 168 ----------------
 47 files changed, 1027 insertions(+), 492 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_batchbuffer.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.c
 create mode 100644 src/gallium/drivers/i965/brw_pipe_rast.h
 create mode 100644 src/gallium/drivers/i965/brw_pipe_shader.c
 create mode 100644 src/gallium/drivers/i965/brw_reg.h
 create mode 100644 src/gallium/drivers/i965/brw_screen.h
 create mode 100644 src/gallium/drivers/i965/brw_winsys.h
 delete mode 100644 src/gallium/drivers/i965/intel_batchbuffer.h

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
new file mode 100644
index 00000000000..76b3c1bf69a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -0,0 +1,124 @@
+#ifndef BRW_BATCHBUFFER_H
+#define BRW_BATCHBUFFER_H
+
+#include "brw_types.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+
+#define BATCH_SZ 16384
+#define BATCH_RESERVED 16
+
+/* All ignored:
+ */
+enum cliprect_mode {
+   IGNORE_CLIPRECTS,
+   LOOP_CLIPRECTS,
+   NO_LOOP_CLIPRECTS,
+   REFERENCES_CLIPRECTS
+};
+
+void brw_batchbuffer_free(struct brw_batchbuffer *batch);
+
+void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+			      const char *file, int line);
+
+#define brw_batchbuffer_flush(batch) \
+	_brw_batchbuffer_flush(batch, __FILE__, __LINE__)
+
+void brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+
+
+/* Unlike bmBufferData, this currently requires the buffer be mapped.
+ * Consider it a convenience function wrapping multple
+ * intel_buffer_dword() calls.
+ */
+void brw_batchbuffer_data(struct brw_batchbuffer *batch,
+                            const void *data, GLuint bytes,
+			    enum cliprect_mode cliprect_mode);
+
+void brw_batchbuffer_release_space(struct brw_batchbuffer *batch,
+                                     GLuint bytes);
+
+GLboolean brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+                                       struct brw_winsys_buffer *buffer,
+				       uint32_t read_domains,
+				       uint32_t write_domain,
+				       uint32_t offset);
+
+/* Inline functions - might actually be better off with these
+ * non-inlined.  Certainly better off switching all command packets to
+ * be passed as structs rather than dwords, but that's a little bit of
+ * work...
+ */
+static INLINE GLint
+brw_batchbuffer_space(struct brw_batchbuffer *batch)
+{
+   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
+}
+
+
+static INLINE void
+brw_batchbuffer_emit_dword(struct brw_batchbuffer *batch, GLuint dword)
+{
+   assert(batch->map);
+   assert(brw_batchbuffer_space(batch) >= 4);
+   *(GLuint *) (batch->ptr) = dword;
+   batch->ptr += 4;
+}
+
+static INLINE boolean
+brw_batchbuffer_require_space(struct brw_batchbuffer *batch,
+                                GLuint sz,
+				enum cliprect_mode cliprect_mode)
+{
+   assert(sz < batch->size - 8);
+   if (brw_batchbuffer_space(batch) < sz) {
+      assert(0);
+      return FALSE;
+   }
+
+   /* All commands should be executed once regardless of cliprect
+    * mode.
+    */
+   (void)cliprect_mode;
+}
+
+/* Here are the crusty old macros, to be removed:
+ */
+#define BATCH_LOCALS
+
+#define BEGIN_BATCH(n, cliprect_mode) do {				\
+   brw_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
+   assert(intel->batch->emit.start_ptr == NULL);			\
+   intel->batch->emit.total = (n) * 4;					\
+   intel->batch->emit.start_ptr = intel->batch->ptr;			\
+} while (0)
+
+#define OUT_BATCH(d) brw_batchbuffer_emit_dword(intel->batch, d)
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
+   assert((unsigned) (delta) < buf->size);				\
+   brw_batchbuffer_emit_reloc(intel->batch, buf,			\
+				read_domains, write_domain, delta);	\
+} while (0)
+
+#define ADVANCE_BATCH() do {						\
+   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
+   assert(intel->batch->emit.start_ptr != NULL);			\
+   if (_n != intel->batch->emit.total) {				\
+      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
+	      _n, intel->batch->emit.total);				\
+      abort();								\
+   }									\
+   intel->batch->emit.start_ptr = NULL;					\
+} while(0)
+
+
+static INLINE void
+brw_batchbuffer_emit_mi_flush(struct brw_batchbuffer *batch)
+{
+   brw_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
+   brw_batchbuffer_emit_dword(batch, MI_FLUSH);
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
index bf2743ebbe3..c8e7851d756 100644
--- a/src/gallium/drivers/i965/brw_cc.c
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -65,7 +65,7 @@ static void prepare_cc_vp( struct brw_context *brw )
    memset(&ccv, 0, sizeof(ccv));
 
    /* PIPE_NEW_VIEWPORT */
-   calc_sane_viewport( &brw->vp, &svp );
+   calc_sane_viewport( &brw->curr.vp, &svp );
 
    ccv.min_depth = svp.near;
    ccv.max_depth = svp.far;
@@ -109,13 +109,13 @@ static void
 cc_unit_populate_key(const struct brw_context *brw,
 		     struct brw_cc_unit_key *key)
 {
-   key->cc0 = brw->dsa->cc0;
-   key->cc1 = brw->dsa->cc1;
-   key->cc2 = brw->dsa->cc2;
-   key->cc3 = combine_cc3( brw->dsa->cc3, brw->blend->cc3 );
-   key->cc5 = brw->blend->cc5;
-   key->cc6 = brw->blend->cc6;
-   key->cc7 = brw->blend->cc7;
+   key->cc0 = brw->curr.dsa->cc0;
+   key->cc1 = brw->curr.dsa->cc1;
+   key->cc2 = brw->curr.dsa->cc2;
+   key->cc3 = combine_cc3( brw->curr.dsa->cc3, brw->curr.blend->cc3 );
+   key->cc5 = brw->curr.blend->cc5;
+   key->cc6 = brw->curr.blend->cc6;
+   key->cc7 = brw->curr.blend->cc7;
 }
 
 /**
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index d82ebeb9a9b..591e9047053 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -33,13 +33,14 @@
 
 #include "util/u_math.h"
 
-#include "intel_batchbuffer.h"
-
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_util.h"
 #include "brw_state.h"
+#include "brw_pipe_rast.h"
 #include "brw_clip.h"
 
 
@@ -77,13 +78,16 @@ static void compile_clip_prog( struct brw_context *brw,
    else
        delta = REG_SIZE;
 
-   for (i = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
-	 c.offset[i] = delta;
-	 delta += ATTR_SIZE;
-      }
+   /* XXX: c.offset is now pretty redundant:
+    */
+   for (i = 0; i < c.key.nr_attrs; i++) {
+      c.offset[i] = delta;
+      delta += ATTR_SIZE;
+   }
 
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   /* XXX: c.nr_attrs is very redundant:
+    */
+   c.nr_attrs = c.key.nr_attrs;
    
    if (BRW_IS_IGDNG(brw))
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
@@ -145,59 +149,21 @@ static void upload_clip_prog(struct brw_context *brw)
 {
    struct brw_clip_prog_key key;
 
-   memset(&key, 0, sizeof(key));
-
-   /* Populate the key:
+   /* Populate the key, starting from the almost-complete version from
+    * the rast state. 
     */
+
+   /* PIPE_NEW_RAST */
+   memcpy(&key, &brw->curr.rast->clip_key, sizeof key);
+
    /* BRW_NEW_REDUCED_PRIMITIVE */
    key.primitive = brw->reduced_primitive;
-   /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written;
-   /* PIPE_NEW_RAST */
-   key.do_flat_shading = brw->rast.base.flatshade;
-   /* PIPE_NEW_UCP */
-   key.nr_userclip = brw->nr_ucp;
 
-   if (BRW_IS_IGDNG(brw))
-       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
-   else
-       key.clip_mode = BRW_CLIPMODE_NORMAL;
+   /* PIPE_NEW_VS */
+   key.nr_attrs = brw->curr.vs->info.file_max[TGSI_FILE_OUTPUT] + 1;
 
-   /* PIPE_NEW_RAST */
-   if (key.primitive == PIPE_PRIM_TRIANGLES) {
-      if (brw->rast->cull_mode = PIPE_WINDING_BOTH)
-	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
-      else {
-	 key.fill_ccw = CLIP_CULL;
-	 key.fill_cw = CLIP_CULL;
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CCW)) {
-	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
-	 }
-
-	 if (!(brw->rast->cull_mode & PIPE_WINDING_CW)) {
-	    key.fill_cw = translate_fill(brw->rast.fill_cw);
-	 }
-
-	 if (key.fill_cw != CLIP_FILL ||
-	     key.fill_ccw != CLIP_FILL) {
-	    key.do_unfilled = 1;
-	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
-	 }
-
-	 key.offset_ccw = brw->rast.offset_ccw;
-	 key.offset_cw = brw->rast.offset_cw;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_cw != CLIP_CULL) 
-	    key.copy_bfc_cw = 1;
-
-	 if (brw->rast.light_twoside &&
-	     key.fill_ccw != CLIP_CULL) 
-	    key.copy_bfc_ccw = 1;
-	 }
-      }
-   }
+   /* PIPE_NEW_CLIP */
+   key.nr_userclip = brw->curr.ucp.nr;
 
    brw->sws->bo_unreference(brw->clip.prog_bo);
    brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
@@ -212,7 +178,7 @@ static void upload_clip_prog(struct brw_context *brw)
 const struct brw_tracked_state brw_clip_prog = {
    .dirty = {
       .mesa  = (PIPE_NEW_RAST | 
-		PIPE_NEW_UCP),
+		PIPE_NEW_CLIP),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
index d80ec819b97..cfe51bf2920 100644
--- a/src/gallium/drivers/i965/brw_clip.h
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -42,8 +42,7 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:32;		
-
+   GLuint nr_attrs:5;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -55,7 +54,7 @@ struct brw_clip_prog_key {
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:12;
+   GLuint pad1:7;
    
    GLfloat offset_factor;
    GLfloat offset_units;
@@ -117,7 +116,7 @@ struct brw_clip_compile {
    GLuint last_mrf;
 
    GLuint header_position_offset;
-   GLuint offset[VERT_ATTRIB_MAX];
+   GLuint offset[PIPE_MAX_SHADER_OUTPUTS];
    GLboolean need_ff_sync;
 };
 
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
index 4baff55806e..8501599aef8 100644
--- a/src/gallium/drivers/i965/brw_clip_unfilled.c
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
index 7a6c46ce077..60bfd3538e2 100644
--- a/src/gallium/drivers/i965/brw_clip_util.c
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -93,7 +93,7 @@ void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
    /* value.xyz *= value.rhw
     */
    brw_set_access_mode(p, BRW_ALIGN_16);
-   brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
+   brw_MUL(p, brw_writemask(pos, BRW_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
    brw_set_access_mode(p, BRW_ALIGN_1);
 }
 
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
index 063ada57728..07a5420d6ee 100644
--- a/src/gallium/drivers/i965/brw_context.c
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -38,7 +38,7 @@
 #include "brw_state.h"
 #include "brw_vs.h"
 #include "brw_screen_tex.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 6699d3bdb6d..3a2fece45c6 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -36,6 +36,8 @@
 #include "brw_structs.h"
 #include "brw_winsys.h"
 #include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
 
 
 /* Glossary:
@@ -143,6 +145,27 @@ struct brw_blend_state {
 };
 
 
+struct brw_rasterizer_state;
+
+
+struct brw_vertex_shader {
+   const struct tgsi_token *tokens;
+   struct tgsi_shader_info info;
+
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
+
+
+struct brw_fragment_shader {
+   const struct tgsi_token *tokens;
+   struct tgsi_shader_info info;
+
+   GLboolean isGLSL;
+
+   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
 
 
@@ -157,6 +180,7 @@ struct brw_blend_state {
 #define PIPE_NEW_VERTEX_SHADER          0x2
 #define PIPE_NEW_FRAGMENT_CONSTS        0x2
 #define PIPE_NEW_VERTEX_CONSTS          0x2
+#define PIPE_NEW_CLIP                   0x2
 
 
 #define BRW_NEW_URB_FENCE               0x1
@@ -196,25 +220,6 @@ struct brw_state_flags {
 };
 
 
-struct brw_vertex_program {
-   const struct tgsi_token *tokens;
-   GLuint id;
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
-
-/** Subclass of Mesa fragment program */
-struct brw_fragment_program {
-   const struct tgsi_token *tokens;
-
-   GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< any IF/LOOP/CONT/BREAK instructions */
-
-   struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
-   GLboolean use_const_buffer;
-};
-
 
 /* Data about a particular attempt to compile a program.  Note that
  * there can be many of these, each in a different GL state
@@ -452,24 +457,29 @@ struct brw_query_object {
  */
 struct brw_context 
 {
-   struct pipe_context *pipe;
-   struct pipe_screen *screen;
-   
+   struct pipe_context pipe;
+
+   struct brw_screen *brw_screen;   
    struct brw_winsys_screen *sws;
 
    GLuint primitive;
+   GLuint reduced_primitive;
 
    GLboolean emit_state_always;
    GLboolean no_batch_wrap;
 
    /* Active vertex program: 
     */
-   const struct gl_vertex_program *vertex_program;
-   const struct gl_fragment_program *fragment_program;
-   struct pipe_framebuffer_state fb;
-   struct brw_depth_stencil_alpha_state *dsa;
-   struct brw_blend_state *blend;
-   struct pipe_viewport_state vp;
+   struct {
+      const struct brw_vertex_shader *vs;
+      const struct brw_fragment_shader *fs;
+      const struct brw_blend_state *blend;
+      const struct brw_rasterizer_state *rast;
+      const struct brw_depth_stencil_alpha_state *dsa;
+      struct pipe_framebuffer_state fb;
+      struct pipe_viewport_state vp;
+      struct pipe_clip_state ucp;
+   } curr;
 
    struct {
       struct brw_state_flags dirty;
@@ -719,29 +729,6 @@ brw_context( struct pipe_context *ctx )
    return (struct brw_context *)ctx;
 }
 
-static INLINE struct brw_vertex_program *
-brw_vertex_program(struct gl_vertex_program *p)
-{
-   return (struct brw_vertex_program *) p;
-}
-
-static INLINE const struct brw_vertex_program *
-brw_vertex_program_const(const struct gl_vertex_program *p)
-{
-   return (const struct brw_vertex_program *) p;
-}
-
-static INLINE struct brw_fragment_program *
-brw_fragment_program(struct gl_fragment_program *p)
-{
-   return (struct brw_fragment_program *) p;
-}
-
-static INLINE const struct brw_fragment_program *
-brw_fragment_program_const(const struct gl_fragment_program *p)
-{
-   return (const struct brw_fragment_program *) p;
-}
 
 
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
index 33ea9a00f74..f2524d75e2c 100644
--- a/src/gallium/drivers/i965/brw_curbe.c
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -30,7 +30,7 @@
   */
 
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 #include "brw_context.h"
 #include "brw_defines.h"
@@ -55,8 +55,8 @@ static void calculate_curbe_offsets( struct brw_context *brw )
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
-   /* PIPE_NEW_UCP */
-   if (brw->nr_ucp) {
+   /* PIPE_NEW_CLIP */
+   if (brw->curr.ucp.nr) {
       GLuint nr_planes = 6 + brw->nr_ucp;
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
@@ -106,7 +106,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
-      .mesa = PIPE_NEW_UCP,
+      .mesa = PIPE_NEW_CLIP,
       .brw  = BRW_NEW_VERTEX_PROGRAM,
       .cache = CACHE_NEW_WM_PROG
    },
@@ -327,7 +327,7 @@ const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
       .mesa = (PIPE_NEW_FS_CONSTANTS |
 	       PIPE_NEW_VS_CONSTANTS |
-	       PIPE_NEW_UCP),
+	       PIPE_NEW_CLIP),
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
index 282c5b18f4e..1dc64ddc8fa 100644
--- a/src/gallium/drivers/i965/brw_defines.h
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -840,8 +840,8 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->deviceID))
+#define BRW_IS_G4X(brw)         (IS_G4X((brw)->brw_screen->pci_id))
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->brw_screen->pci_id))
 #define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
 #define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
 #define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
index 856999f3ef8..741537309ad 100644
--- a/src/gallium/drivers/i965/brw_draw.c
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -31,7 +31,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BATCH
@@ -133,7 +133,7 @@ static void brw_emit_prim(struct brw_context *brw,
       ADVANCE_BATCH();
    }
    if (prim_packet.verts_per_instance) {
-      intel_batchbuffer_data( brw->intel.batch, &prim_packet,
+      brw_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
    if (intel->always_flush_cache) {
@@ -224,7 +224,7 @@ static GLboolean brw_try_draw_prims( struct brw_context *brw,
       return ret;
 
    if (intel->always_flush_batch)
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
 
    return 0;
 }
@@ -249,12 +249,10 @@ void brw_draw_prims( struct brw_context *brw,
     */
    ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
-   /* Otherwise, we really are out of memory.  Pass the drawing
-    * command to the software tnl module and which will in turn call
-    * swrast to do the drawing.
+   /* Otherwise, flush and retry:
     */
    if (ret != 0) {
-      intel_batchbuffer_flush(intel->batch);
+      brw_batchbuffer_flush(intel->batch);
       ret = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
       assert(ret == 0);
    }
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
index dce015d79f9..1ab65d60c41 100644
--- a/src/gallium/drivers/i965/brw_draw_upload.c
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -35,7 +35,7 @@
 #include "brw_state.h"
 #include "brw_fallback.h"
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_tex.h"
 
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index 30603bdd0e6..46d52a473b6 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -35,7 +35,6 @@
 
 #include "brw_structs.h"
 #include "brw_defines.h"
-#include "shader/prog_instruction.h"
 
 #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
 #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
@@ -45,6 +44,23 @@
 #define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
 #define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
 
+#define BRW_WRITEMASK_NONE     0x00
+#define BRW_WRITEMASK_X        0x01
+#define BRW_WRITEMASK_Y        0x02
+#define BRW_WRITEMASK_XY       0x03
+#define BRW_WRITEMASK_Z        0x04
+#define BRW_WRITEMASK_XZ       0x05
+#define BRW_WRITEMASK_YZ       0x06
+#define BRW_WRITEMASK_XYZ      0x07
+#define BRW_WRITEMASK_W        0x08
+#define BRW_WRITEMASK_XW       0x09
+#define BRW_WRITEMASK_YW       0x0A
+#define BRW_WRITEMASK_XYW      0x0B
+#define BRW_WRITEMASK_ZW       0x0C
+#define BRW_WRITEMASK_XZW      0x0D
+#define BRW_WRITEMASK_YZW      0x0E
+#define BRW_WRITEMASK_XYZW     0x0F
+
 
 #define REG_SIZE (8*4)
 
@@ -157,7 +173,7 @@ static INLINE int type_sz( GLuint type )
  * \param width  one of BRW_WIDTH_x
  * \param hstride  one of BRW_HORIZONTAL_STRIDE_x
  * \param swizzle  one of BRW_SWIZZLE_x
- * \param writemask  WRITEMASK_X/Y/Z/W bitfield
+ * \param writemask  BRW_WRITEMASK_X/Y/Z/W bitfield
  */
 static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint nr,
@@ -215,7 +231,7 @@ static INLINE struct brw_reg brw_vec16_reg( GLuint file,
 		  BRW_WIDTH_16,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[8] register */
@@ -231,7 +247,7 @@ static INLINE struct brw_reg brw_vec8_reg( GLuint file,
 		  BRW_WIDTH_8,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[4] register */
@@ -247,7 +263,7 @@ static INLINE struct brw_reg brw_vec4_reg( GLuint file,
 		  BRW_WIDTH_4,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYZW,
-		  WRITEMASK_XYZW);
+		  BRW_WRITEMASK_XYZW);
 }
 
 /** Construct float[2] register */
@@ -263,7 +279,7 @@ static INLINE struct brw_reg brw_vec2_reg( GLuint file,
 		  BRW_WIDTH_2,
 		  BRW_HORIZONTAL_STRIDE_1,
 		  BRW_SWIZZLE_XYXY,
-		  WRITEMASK_XY);
+		  BRW_WRITEMASK_XY);
 }
 
 /** Construct float[1] register */
@@ -279,7 +295,7 @@ static INLINE struct brw_reg brw_vec1_reg( GLuint file,
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XXXX,
-		  WRITEMASK_X);
+		  BRW_WRITEMASK_X);
 }
 
 
@@ -510,7 +526,7 @@ static INLINE struct brw_reg brw_ip_reg( void )
 		  BRW_WIDTH_1,
 		  BRW_HORIZONTAL_STRIDE_0,
 		  BRW_SWIZZLE_XYZW, /* NOTE! */
-		  WRITEMASK_XYZW); /* NOTE! */
+		  BRW_WRITEMASK_XYZW); /* NOTE! */
 }
 
 static INLINE struct brw_reg brw_acc_reg( void )
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index 241cdc33f86..f6b8843e01e 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -1276,7 +1276,7 @@ void brw_SAMPLE(struct brw_compile *p,
     * instruction, so that is a guide for whether a workaround is
     * needed.
     */
-   if (writemask != WRITEMASK_XYZW) {
+   if (writemask != BRW_WRITEMASK_XYZW) {
       GLuint dst_offset = 0;
       GLuint i, newmask = 0, len = 0;
 
@@ -1299,7 +1299,7 @@ void brw_SAMPLE(struct brw_compile *p,
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 	 
-	 newmask = ~newmask & WRITEMASK_XYZW;
+	 newmask = ~newmask & BRW_WRITEMASK_XYZW;
 
 	 brw_push_insn_state(p);
 
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 58930e7964f..692ce46679c 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
       
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
index 9ec206d7e89..fd8e2accedd 100644
--- a/src/gallium/drivers/i965/brw_gs_emit.c
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -30,7 +30,7 @@
   */
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
index d33bf40a011..eb39be85454 100644
--- a/src/gallium/drivers/i965/brw_misc_state.c
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -31,7 +31,7 @@
  
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_regions.h"
 
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index d5b7bd3b83f..e85a1a9c1b3 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -1,6 +1,6 @@
 
 /**
- * called from intel_batchbuffer_flush and children before sending a
+ * called from brw_batchbuffer_flush and children before sending a
  * batchbuffer off.
  */
 static void brw_finish_batch(struct intel_context *intel)
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index 0b9ba0c0edb..55242ac6adf 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -42,7 +42,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_reg.h"
 
 /** Waits on the query object's BO and totals the results for this query */
@@ -122,7 +122,7 @@ brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
     */
    if (query->bo) {
       brw_emit_query_end(brw);
-      intel_batchbuffer_flush(brw->batch);
+      brw_batchbuffer_flush(brw->batch);
 
       brw->sws->bo_unreference(brw->query.bo);
       brw->query.bo = NULL;
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
new file mode 100644
index 00000000000..ff64dbd48d4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -0,0 +1,46 @@
+
+static void
+calculate_clip_key_rast()
+{
+   if (BRW_IS_IGDNG(brw))
+       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+   else
+       key.clip_mode = BRW_CLIPMODE_NORMAL;
+
+   key.do_flat_shading = brw->rast->templ.flatshade;
+
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (brw->rast->templ.cull_mode = PIPE_WINDING_BOTH)
+	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
+      else {
+	 key.fill_ccw = CLIP_CULL;
+	 key.fill_cw = CLIP_CULL;
+
+	 if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CCW)) {
+	    key.fill_ccw = translate_fill(brw->rast.fill_ccw);
+	 }
+
+	 if (!(brw->rast->templ.cull_mode & PIPE_WINDING_CW)) {
+	    key.fill_cw = translate_fill(brw->rast.fill_cw);
+	 }
+
+	 if (key.fill_cw != CLIP_FILL ||
+	     key.fill_ccw != CLIP_FILL) {
+	    key.do_unfilled = 1;
+	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+	 }
+
+	 key.offset_ccw = brw->rast.templ.offset_ccw;
+	 key.offset_cw = brw->rast.templ.offset_cw;
+
+	 if (brw->rast.templ.light_twoside &&
+	     key.fill_cw != CLIP_CULL) 
+	    key.copy_bfc_cw = 1;
+
+	 if (brw->rast.templ.light_twoside &&
+	     key.fill_ccw != CLIP_CULL) 
+	    key.copy_bfc_ccw = 1;
+	 }
+      }
+   }
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
new file mode 100644
index 00000000000..6ceaa1fb099
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -0,0 +1,14 @@
+#ifndef BRW_PIPE_RAST_H
+#define BRW_PIPE_RAST_H
+
+#include "brw_clip.h"
+
+struct brw_rasterizer_state {
+   struct pipe_rasterizer_state templ; /* for draw module */
+
+   /* Precalculated hardware state:
+    */
+   struct brw_clip_prog_key clip_key;
+};
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
new file mode 100644
index 00000000000..fbb772d18c9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -0,0 +1,159 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+  
+#include "brw_context.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+
+static void brwBindProgram( struct brw_context *brw,
+			    GLenum target, 
+			    struct gl_program *prog )
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB: 
+      brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      break;
+   case GL_FRAGMENT_PROGRAM_ARB:
+      brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      break;
+   }
+}
+
+static struct gl_program *brwNewProgram( structg brw_context *brw,
+				      GLenum target, 
+				      GLuint id )
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   switch (target) {
+   case GL_VERTEX_PROGRAM_ARB: {
+      struct brw_vertex_program *prog = CALLOC_STRUCT(brw_vertex_program);
+      if (prog) {
+	 prog->id = brw->program_id++;
+
+	 return _mesa_init_vertex_program( ctx, &prog->program,
+					     target, id );
+      }
+      else
+	 return NULL;
+   }
+
+   case GL_FRAGMENT_PROGRAM_ARB: {
+      struct brw_fragment_program *prog = CALLOC_STRUCT(brw_fragment_program);
+      if (prog) {
+	 prog->id = brw->program_id++;
+
+	 return _mesa_init_fragment_program( ctx, &prog->program,
+					     target, id );
+      }
+      else
+	 return NULL;
+   }
+
+   default:
+      return _mesa_new_program(ctx, target, id);
+   }
+}
+
+static void brwDeleteProgram( struct brw_context *brw,
+			      struct gl_program *prog )
+{
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      brw->sws->bo_unreference(brw_fprog->const_buffer);
+   }
+
+   _mesa_delete_program( ctx, prog );
+}
+
+
+static GLboolean brwIsProgramNative( struct brw_context *brw,
+				     GLenum target, 
+				     struct gl_program *prog )
+{
+   return GL_TRUE;
+}
+
+static void brwProgramStringNotify( struct brw_context *brw,
+				    GLenum target,
+				    struct gl_program *prog )
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   if (target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
+      if (fprog->FogOption) {
+         _mesa_append_fog_code(ctx, fprog);
+         fprog->FogOption = GL_NONE;
+      }
+
+      if (newFP == curFP)
+	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
+      newFP->id = brw->program_id++;      
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
+   }
+   else if (target == GL_VERTEX_PROGRAM_ARB) {
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
+	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      if (newVP->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &newVP->program);
+      }
+      newVP->id = brw->program_id++;      
+
+      /* Also tell tnl about it:
+       */
+      _tnl_program_string(ctx, target, prog);
+   }
+}
+
+void brwInitFragProgFuncs( struct dd_function_table *functions )
+{
+   assert(functions->ProgramStringNotify == _tnl_program_string); 
+
+   functions->BindProgram = brwBindProgram;
+   functions->NewProgram = brwNewProgram;
+   functions->DeleteProgram = brwDeleteProgram;
+   functions->IsProgramNative = brwIsProgramNative;
+   functions->ProgramStringNotify = brwProgramStringNotify;
+}
+
diff --git a/src/gallium/drivers/i965/brw_reg.h b/src/gallium/drivers/i965/brw_reg.h
new file mode 100644
index 00000000000..a640104d71b
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_reg.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#define CMD_MI				(0x0 << 29)
+#define CMD_2D				(0x2 << 29)
+#define CMD_3D				(0x3 << 29)
+
+#define MI_NOOP				(CMD_MI | 0)
+#define MI_BATCH_BUFFER_END		(CMD_MI | 0xA << 23)
+#define MI_FLUSH			(CMD_MI | (4 << 23))
+
+#define _3DSTATE_DRAWRECT_INFO_I965	(CMD_3D | (3 << 27) | (1 << 24) | 0x2)
+
+/** @{
+ *
+ * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
+ * additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL		(CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE		(0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE	(1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT	(2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP	(3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL	(1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH	(1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH	(1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
+
+/** @} */
+
+#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6)
+#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4)
+#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00 */
+#define XY_BLT_WRITE_ALPHA	(1 << 21)
+#define XY_BLT_WRITE_RGB	(1 << 20)
+#define XY_SRC_TILED		(1 << 15)
+#define XY_DST_TILED		(1 << 11)
+
+/* BR13 */
+#define BR13_565		(0x1 << 24)
+#define BR13_8888		(0x3 << 24)
+
+#define FENCE_LINEAR 0
+#define FENCE_XMAJOR 1
+#define FENCE_YMAJOR 2
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
new file mode 100644
index 00000000000..716b55c52bc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -0,0 +1,78 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+
+struct brw_winsys_screen;
+
+
+/**
+ * Subclass of pipe_screen
+ */
+struct brw_screen
+{
+   struct pipe_screen base;
+
+   struct brw_winsys_screen *sws;
+
+   boolean is_i945;
+   uint pci_id;
+};
+
+/**
+ * Subclass of pipe_transfer
+ */
+struct brw_transfer
+{
+   struct pipe_transfer base;
+
+   unsigned offset;
+};
+
+
+/*
+ * Cast wrappers
+ */
+static INLINE struct brw_screen *
+brw_screen(struct pipe_screen *pscreen)
+{
+   return (struct brw_screen *) pscreen;
+}
+
+static INLINE struct brw_transfer *
+brw_transfer(struct pipe_transfer *transfer)
+{
+   return (struct brw_transfer *)transfer;
+}
+
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index d199d0b81a8..544be6a0896 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -1,6 +1,6 @@
    /* _NEW_BUFFERS */
-   if (IS_965(intel->intelScreen->deviceID) &&
-       !IS_G4X(intel->intelScreen->deviceID)) {
+   if (IS_965(brw->brw_screen->pci_id) &&
+       !IS_G4X(brw->brw_screen->pci_id)) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
 	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 0115f77c08e..54202cbd123 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -30,7 +30,7 @@
   */
   
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index 26c2e8891aa..c99116b8b1b 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -34,7 +34,6 @@
 #define BRW_SF_H
 
 
-#include "shader/program.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index c98d7ec13a6..4acb2b7d72e 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -30,7 +30,7 @@
   */
    
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 #include "brw_defines.h"
 #include "brw_context.h"
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index b716097bfcd..02657eaba7f 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -157,7 +157,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 95687946250..b285837070c 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -32,7 +32,7 @@
 
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 
@@ -47,7 +47,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
 
@@ -74,7 +74,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   brw_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
    return GL_TRUE;
 }
 
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index 91d0f802976..1b5f27cc160 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -57,7 +57,7 @@
  */
 
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index b68b6cb21a9..842380e38f1 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -33,7 +33,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
index 75cdc189121..813cd31f49c 100644
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ b/src/gallium/drivers/i965/brw_tex_layout.c
@@ -47,7 +47,7 @@ GLboolean brw_miptree_layout(struct brw_context *brw,
 
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      if (IS_IGDNG(brw->brw_screen->pci_id)) {
           GLuint align_h = 2, align_w = 4;
           GLuint level;
           GLuint x = 0;
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index 8c6f4355a6e..18d79c5ebbb 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -31,7 +31,7 @@
         
 
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
index 37c3acbc118..b5f9a36e7bc 100644
--- a/src/gallium/drivers/i965/brw_util.h
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -36,9 +36,8 @@
 #include "brw_types.h"
 
 extern GLuint brw_count_bits( GLuint val );
-extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
-extern GLuint brw_translate_blend_factor( GLenum factor );
-extern GLuint brw_translate_blend_equation( GLenum mode );
+extern GLuint brw_translate_blend_factor( unsigned factor );
+extern GLuint brw_translate_blend_equation( unsigned mode );
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 97e523c3ee7..dcd687ac34a 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -34,7 +34,6 @@
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
-#include "shader/prog_print.h"
 
 
@@ -113,7 +112,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = PIPE_NEW_UCP | PIPE_NEW_RAST,
+      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 4a591365c98..54f7d7d7c49 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -36,7 +36,6 @@
 
 #include "brw_context.h"
 #include "brw_eu.h"
-#include "shader/program.h"
 
 
 struct brw_vs_prog_key {
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 6adb7430171..e9469442953 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -192,7 +192,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 					     BRW_WIDTH_8,
 					     BRW_HORIZONTAL_STRIDE_1,
 					     BRW_SWIZZLE_XXXX,
-					     WRITEMASK_X);
+					     BRW_WRITEMASK_X);
       reg++;
    }
 
@@ -487,7 +487,7 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
    struct brw_compile *p = &c->func;
    
 
-   if (dst.dw1.bits.writemask & WRITEMASK_X) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
       struct brw_reg tmp = get_tmp(c);
       struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
 
@@ -499,23 +499,23 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
       /* Adjust exponent for floating point: 
        * exp += 127 
        */
-      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
+      brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
 
       /* Install exponent and sign.  
        * Excess drops off the edge: 
        */
-      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X), 
+      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X), 
 	      tmp_d, brw_imm_d(23));
 
       release_tmp(c, tmp);
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
       /* result[1] = arg0.x - floor(arg0.x) */
-      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
+      brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* As with the LOG instruction, we might be better off just
        * doing a taylor expansion here, seeing as we have to do all
        * the prep work.
@@ -525,14 +525,14 @@ static void emit_exp_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_EXP, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(arg0, 0), 
 		 BRW_MATH_PRECISION_FULL);
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 }
 
@@ -562,36 +562,36 @@ static void emit_log_noalias( struct brw_vs_compile *c,
     * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
     * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1U<<31)-1));
 
       brw_SHR(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_X), 
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_X), 
 	      tmp_ud,
 	      brw_imm_ud(23));
 
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_X), 
+	      brw_writemask(tmp, BRW_WRITEMASK_X), 
 	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
 	      brw_imm_d(-127));
    }
 
-   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
       brw_AND(p, 
-	      brw_writemask(tmp_ud, WRITEMASK_Y),
+	      brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
 	      brw_swizzle1(arg0_ud, 0),
 	      brw_imm_ud((1<<23)-1));
 
       brw_OR(p, 
-	     brw_writemask(tmp_ud, WRITEMASK_Y), 
+	     brw_writemask(tmp_ud, BRW_WRITEMASK_Y), 
 	     tmp_ud,
 	     brw_imm_ud(127<<23));
    }
    
-   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
       /* result[2] = result[0] + LOG2(result[1]); */
 
       /* Why bother?  The above is just a hint how to do this with a
@@ -606,19 +606,19 @@ static void emit_log_noalias( struct brw_vs_compile *c,
        */
       emit_math1(c, 
 		 BRW_MATH_FUNCTION_LOG, 
-		 brw_writemask(tmp, WRITEMASK_Z), 
+		 brw_writemask(tmp, BRW_WRITEMASK_Z), 
 		 brw_swizzle1(tmp, 1), 
 		 BRW_MATH_PRECISION_FULL);
       
       brw_ADD(p, 
-	      brw_writemask(tmp, WRITEMASK_Z), 
+	      brw_writemask(tmp, BRW_WRITEMASK_Z), 
 	      brw_swizzle1(tmp, 2), 
 	      brw_swizzle1(tmp, 0));
    }  
 
-   if (dst.dw1.bits.writemask & WRITEMASK_W) {
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
       /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
    }
 
    if (need_tmp) {
@@ -639,14 +639,14 @@ static void emit_dst_noalias( struct brw_vs_compile *c,
 
    /* There must be a better way to do this: 
     */
-   if (dst.dw1.bits.writemask & WRITEMASK_X)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
-   if (dst.dw1.bits.writemask & WRITEMASK_Y)
-      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
-   if (dst.dw1.bits.writemask & WRITEMASK_Z)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
-   if (dst.dw1.bits.writemask & WRITEMASK_W)
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
+      brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
+   if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
 }
 
 
@@ -672,8 +672,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
    
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0)); 
-   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0)); 
+   brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1)); 
 
    /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
     * to get all channels active inside the IF.  In the clipping code
@@ -683,15 +683,15 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
    if_insn = brw_IF(p, BRW_EXECUTE_8);
    {
-      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
+      brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
 
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
-      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
+      brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z),  brw_swizzle1(arg0,1));
       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
       emit_math2(c, 
 		 BRW_MATH_FUNCTION_POW, 
-		 brw_writemask(dst, WRITEMASK_Z),
+		 brw_writemask(dst, BRW_WRITEMASK_Z),
 		 brw_swizzle1(tmp, 2),
 		 brw_swizzle1(arg0, 3),
 		 BRW_MATH_PRECISION_PARTIAL);      
@@ -1045,7 +1045,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* ndc = 1.0 / pos.w */
    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
    /* ndc.xyz = pos * ndc */
-   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
+   brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
@@ -1062,14 +1062,14 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
-	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
-	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
+	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
       }
 
       for (i = 0; i < c->key.nr_userclip; i++) {
 	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
 	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
 
@@ -1089,7 +1089,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 brw_swizzle1(ndc, 3),
 		 brw_imm_f(0));
    
-	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
+	 brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
 	 brw_MOV(p, ndc, brw_imm_f(0));
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
       }
@@ -1139,7 +1139,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
-
+!
    if (c->first_overflow_output > 0) {
       /* Not all of the vertex outputs/results fit into the MRF.
        * Move the overflowed attributes from the GRF to the MRF and
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
new file mode 100644
index 00000000000..2142db5a4db
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -0,0 +1,243 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+#include "pipe/p_compiler.h"
+
+struct brw_winsys;
+struct pipe_fence_handle;
+
+/* This currently just wraps dri_bo:
+ */
+struct brw_winsys_buffer {
+   struct brw_winsys_screen *sws;
+   void *bo;
+   unsigned offset;
+};
+
+enum brw_buffer_usage {
+   I915_GEM_DOMAIN_RENDER,
+   I915_GEM_DOMAIN_SAMPLER,
+   I915_GEM_DOMAIN_VERTEX,
+   I915_GEM_DOMAIN_INSTRUCTION,
+
+
+   /* XXX: migrate from domains to explicit usage cases, eg below:
+    */
+
+   /* use on textures */
+   BRW_USAGE_RENDER    = 0x01,
+   BRW_USAGE_SAMPLER   = 0x02,
+   BRW_USAGE_2D_TARGET = 0x04,
+   BRW_USAGE_2D_SOURCE = 0x08,
+   /* use on vertex */
+   BRW_USAGE_VERTEX    = 0x10,
+};
+
+enum brw_buffer_type
+{
+   BRW_BUFFER_TYPE_TEXTURE,
+   BRW_BUFFER_TYPE_SCANOUT, /**< a texture used for scanning out from */
+   BRW_BUFFER_TYPE_VERTEX,
+};
+
+
+/* AKA winsys context:
+ */
+struct brw_batchbuffer {
+
+   struct brw_winsys *iws;
+   struct brw_winsys_buffer *buf;
+
+   /**
+    * Values exported to speed up the writing the batchbuffer,
+    * instead of having to go trough a accesor function for
+    * each dword written.
+    */
+   /*{@*/
+   uint8_t *map;
+   uint8_t *ptr;
+   size_t size;
+
+   size_t relocs;
+   size_t max_relocs;
+   /*@}*/
+};
+
+struct brw_winsys_screen {
+
+   /**
+    * Batchbuffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a new batchbuffer.
+    */
+   struct brw_batchbuffer *(*batchbuffer_create)(struct brw_winsys_screen *iws);
+
+   /**
+    * Emit a relocation to a buffer.
+    * Target position in batchbuffer is the same as ptr.
+    */
+   int (*batchbuffer_reloc)(struct brw_batchbuffer *batch,
+			    unsigned offset,
+                            struct brw_winsys_buffer *reloc,
+			    unsigned pre_add,
+                            enum brw_buffer_usage usage);
+
+   /**
+    * Flush a bufferbatch.
+    */
+   void (*batchbuffer_flush)(struct brw_batchbuffer *batch,
+                             struct pipe_fence_handle **fence);
+
+   /**
+    * Destroy a batchbuffer.
+    */
+   void (*batchbuffer_destroy)(struct brw_batchbuffer *batch);
+   /*@}*/
+
+
+   /**
+    * Buffer functions.
+    */
+   /*@{*/
+   /**
+    * Create a buffer.
+    */
+   struct brw_winsys_buffer *(*buffer_create)(struct brw_winsys *iws,
+					      unsigned size, 
+					      unsigned alignment,
+					      enum brw_buffer_type type);
+
+
+   /* Reference and unreference buffers:
+    */
+   void (*bo_reference)( struct brw_winsys_buffer *buffer );
+   void (*bo_unreference)( struct brw_winsys_buffer *buffer );
+   void (*bo_emit_reloc)( struct brw_winsys_buffer *buffer,
+			  unsigned domain,
+			  unsigned a,
+			  unsigned b,
+			  unsigned offset,
+			  struct brw_winsys_buffer *b2);
+
+   /**
+    * Map a buffer.
+    */
+   void *(*buffer_map)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *buffer,
+                       boolean write);
+
+   /**
+    * Unmap a buffer.
+    */
+   void (*buffer_unmap)(struct brw_winsys *iws,
+                        struct brw_winsys_buffer *buffer);
+
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follows pipe_buffer_write.
+    */
+   int (*buffer_write)(struct brw_winsys *iws,
+                       struct brw_winsys_buffer *dst,
+                       size_t offset,
+                       size_t size,
+                       const void *data);
+
+   void (*buffer_destroy)(struct brw_winsys *iws,
+                          struct brw_winsys_buffer *buffer);
+   /*@}*/
+
+
+   /**
+    * Fence functions.
+    */
+   /*@{*/
+   /**
+    * Reference fence and set ptr to fence.
+    */
+   void (*fence_reference)(struct brw_winsys *iws,
+                           struct pipe_fence_handle **ptr,
+                           struct pipe_fence_handle *fence);
+
+   /**
+    * Check if a fence has finished.
+    */
+   int (*fence_signalled)(struct brw_winsys *iws,
+                          struct pipe_fence_handle *fence);
+
+   /**
+    * Wait on a fence to finish.
+    */
+   int (*fence_finish)(struct brw_winsys *iws,
+                       struct pipe_fence_handle *fence);
+   /*@}*/
+
+
+   /**
+    * Destroy the winsys.
+    */
+   void (*destroy)(struct brw_winsys *iws);
+};
+
+
+/**
+ * Create i915 pipe_screen.
+ */
+struct pipe_screen *i915_create_screen(struct brw_winsys *iws, unsigned pci_id);
+
+/**
+ * Create a i915 pipe_context.
+ */
+struct pipe_context *i915_create_context(struct pipe_screen *screen);
+
+/**
+ * Get the brw_winsys buffer backing the texture.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture;
+boolean i915_get_texture_buffer_brw(struct pipe_texture *texture,
+				    struct brw_winsys_buffer **buffer,
+				    unsigned *stride);
+
+/**
+ * Wrap a brw_winsys buffer with a texture blanket.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture * i915_texture_blanket_brw(struct pipe_screen *screen,
+                                                 struct pipe_texture *tmplt,
+                                                 unsigned pitch,
+                                                 struct brw_winsys_buffer *buffer);
+
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 756a6801507..18775830f99 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -34,7 +34,6 @@
 #define BRW_WM_H
 
 
-#include "shader/prog_instruction.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index 220821087c1..c6659646f2c 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -98,7 +98,7 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    }
    _mesa_printf("]");
 
-   if (inst->writemask != WRITEMASK_XYZW)
+   if (inst->writemask != BRW_WRITEMASK_XYZW)
       _mesa_printf(".%s%s%s%s", 
 		   GET_BIT(inst->writemask, 0) ? "x" : "",
 		   GET_BIT(inst->writemask, 1) ? "y" : "",
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index fec33f74ebc..7df9b79d7a2 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -72,14 +72,14 @@ static void emit_pixel_xy(struct brw_compile *p,
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
 	      brw_imm_v(0x10101010));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
 	      stride(suboffset(r1_uw,5), 2, 4, 0),
@@ -101,14 +101,14 @@ static void emit_delta_xy(struct brw_compile *p,
    /* Calc delta X,Y by subtracting origin in r1 from the pixel
     * centers.
     */
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       brw_ADD(p,
 	      dst[0],
 	      retype(arg0[0], BRW_REGISTER_TYPE_UW),
 	      negate(r1));
    }
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_ADD(p,
 	      dst[1],
 	      retype(arg0[1], BRW_REGISTER_TYPE_UW),
@@ -124,7 +124,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
 
-   if (mask & WRITEMASK_X) {
+   if (mask & BRW_WRITEMASK_X) {
       /* X' = X */
       brw_MOV(p,
 	      dst[0],
@@ -133,7 +133,7 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 
    /* XXX: is this needed any more, or is this a NOOP?
     */
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       /* Y' = height - 1 - Y */
       brw_ADD(p,
 	      dst[1],
@@ -152,7 +152,7 @@ static void emit_pixel_w( struct brw_compile *p,
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
     */
-   if (mask & WRITEMASK_W) {      
+   if (mask & BRW_WRITEMASK_W) {      
       struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
 
       /* Calc 1/w - just linterp wpos[3] optimized by putting the
@@ -255,7 +255,7 @@ static void emit_frontfacing( struct brw_compile *p,
    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    GLuint i;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return;
 
    for (i = 0; i < 4; i++) {
@@ -321,26 +321,26 @@ void emit_ddxy(struct brw_compile *p,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_2,
 			   BRW_WIDTH_2,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 } else {
 	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
 			   BRW_REGISTER_TYPE_F,
 			   BRW_VERTICAL_STRIDE_4,
 			   BRW_WIDTH_4,
 			   BRW_HORIZONTAL_STRIDE_0,
-			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+			   BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
 	 }
 	 brw_ADD(p, dst[i], src0, negate(src1));
       }
@@ -611,12 +611,12 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -633,12 +633,12 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -656,12 +656,12 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   const int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
@@ -681,7 +681,7 @@ static void emit_xpd( struct brw_compile *p,
 {
    GLuint i;
 
-   assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
+   assert(!(mask & BRW_WRITEMASK_W) == BRW_WRITEMASK_X);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
@@ -704,12 +704,12 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
@@ -732,12 +732,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
-   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   int dst_chan = _mesa_ffs(mask & BRW_WRITEMASK_XYZW) - 1;
 
-   if (!(mask & WRITEMASK_XYZW))
+   if (!(mask & BRW_WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+   assert(is_power_of_two(mask & BRW_WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -795,17 +795,17 @@ static void emit_tex( struct brw_wm_compile *c,
     */
    switch (inst->tex_idx) {
    case TEXTURE_1D_INDEX:
-      emit = WRITEMASK_X;
+      emit = BRW_WRITEMASK_X;
       nr = 1;
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
-      emit = WRITEMASK_XY;
+      emit = BRW_WRITEMASK_XY;
       nr = 2;
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
-      emit = WRITEMASK_XYZ;
+      emit = BRW_WRITEMASK_XYZ;
       nr = 3;
       break;
    default:
@@ -815,7 +815,7 @@ static void emit_tex( struct brw_wm_compile *c,
 
    if (inst->tex_shadow) {
       nr = 4;
-      emit |= WRITEMASK_W;
+      emit |= BRW_WRITEMASK_W;
    }
 
    msgLength = 1;
@@ -922,18 +922,18 @@ static void emit_lit( struct brw_compile *p,
 		      GLuint mask,
 		      const struct brw_reg *arg0 )
 {
-   assert((mask & WRITEMASK_XW) == 0);
+   assert((mask & BRW_WRITEMASK_XW) == 0);
 
-   if (mask & WRITEMASK_Y) {
+   if (mask & BRW_WRITEMASK_Y) {
       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
       brw_MOV(p, dst[1], arg0[0]);
       brw_set_saturate(p, 0);
    }
 
-   if (mask & WRITEMASK_Z) {
+   if (mask & BRW_WRITEMASK_Z) {
       emit_math2(p, BRW_MATH_FUNCTION_POW,
 		 &dst[2],
-		 WRITEMASK_X | (mask & SATURATE),
+		 BRW_WRITEMASK_X | (mask & SATURATE),
 		 &arg0[1],
 		 &arg0[3]);
    }
@@ -944,10 +944,10 @@ static void emit_lit( struct brw_compile *p,
     */
    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
    {
-      if (mask & WRITEMASK_Y) 
+      if (mask & BRW_WRITEMASK_Y) 
 	 brw_MOV(p, dst[1], brw_imm_f(0));
 
-      if (mask & WRITEMASK_Z) 
+      if (mask & BRW_WRITEMASK_Z) 
 	 brw_MOV(p, dst[2], brw_imm_f(0)); 
    }
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1414,10 +1414,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* There is an scs math function, but it would need some
 	  * fixup for 16-element execution.
 	  */
-	 if (dst_flags & WRITEMASK_X)
-	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
-	 if (dst_flags & WRITEMASK_Y)
-	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_X)
+	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+	 if (dst_flags & BRW_WRITEMASK_Y)
+	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
 	 break;
 
       case OPCODE_POW:
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index 5f47d86f717..be240031c7c 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -115,7 +115,7 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    struct prog_dst_register reg;
    reg.File = file;
    reg.Index = idx;
-   reg.WriteMask = WRITEMASK_XYZW;
+   reg.WriteMask = BRW_WRITEMASK_XYZW;
    reg.RelAddr = 0;
    reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
@@ -249,7 +249,7 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELXY,
-	      dst_mask(pixel_xy, WRITEMASK_XY),
+	      dst_mask(pixel_xy, BRW_WRITEMASK_XY),
 	      0,
 	      payload_r0_depth,
 	      src_undef(),
@@ -272,7 +272,7 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_DELTAXY,
-	      dst_mask(delta_xy, WRITEMASK_XY),
+	      dst_mask(delta_xy, BRW_WRITEMASK_XY),
 	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
@@ -295,7 +295,7 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
        */
       emit_op(c,
 	      WM_PIXELW,
-	      dst_mask(pixel_w, WRITEMASK_W),
+	      dst_mask(pixel_w, BRW_WRITEMASK_W),
 	      0,
 	      interp_wpos,
 	      deltas, 
@@ -327,13 +327,13 @@ static void emit_interp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      WM_WPOSXY,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      get_pixel_xy(c),
 	      src_undef(),
 	      src_undef());
       
-      dst = dst_mask(dst, WRITEMASK_ZW);
+      dst = dst_mask(dst, BRW_WRITEMASK_ZW);
 
       /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
        */
@@ -370,7 +370,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* Interpolate the fog coordinate */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_X),
+	      dst_mask(dst, BRW_WRITEMASK_X),
 	      0,
 	      interp,
 	      deltas,
@@ -378,7 +378,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_YZW),
+	      dst_mask(dst, BRW_WRITEMASK_YZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -393,7 +393,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
               WM_FRONTFACING,
-              dst_mask(dst, WRITEMASK_X),
+              dst_mask(dst, BRW_WRITEMASK_X),
               0,
               src_undef(),
               src_undef(),
@@ -404,7 +404,7 @@ static void emit_interp( struct brw_wm_compile *c,
       /* XXX review/test this case */
       emit_op(c,
 	      WM_PINTERP,
-	      dst_mask(dst, WRITEMASK_XY),
+	      dst_mask(dst, BRW_WRITEMASK_XY),
 	      0,
 	      interp,
 	      deltas,
@@ -412,7 +412,7 @@ static void emit_interp( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_ZW),
+	      dst_mask(dst, BRW_WRITEMASK_ZW),
 	      0,
 	      src_swizzle(interp,
 			  SWIZZLE_ZERO,
@@ -518,19 +518,19 @@ static void precalc_dst( struct brw_wm_compile *c,
    struct prog_src_register src1 = inst->SrcReg[1];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_Y) {      
+   if (dst.WriteMask & BRW_WRITEMASK_Y) {      
       /* dst.y = mul src0.y, src1.y
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      inst->SaturateMode,
 	      src0,
 	      src1,
 	      src_undef());
    }
 
-   if (dst.WriteMask & WRITEMASK_XZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
 
@@ -538,7 +538,7 @@ static void precalc_dst( struct brw_wm_compile *c,
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XZ),
+		    dst_mask(dst, BRW_WRITEMASK_XZ),
 		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
 		    src_undef(),
@@ -546,12 +546,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       /* Avoid letting negation flag of src0 affect our 1 constant. */
       swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
-   if (dst.WriteMask & WRITEMASK_W) {
+   if (dst.WriteMask & BRW_WRITEMASK_W) {
       /* dst.w = mov src1.w
        */
       emit_op(c,
 	      TGSI_OPCODE_MOV,
-	      dst_mask(dst, WRITEMASK_W),
+	      dst_mask(dst, BRW_WRITEMASK_W),
 	      inst->SaturateMode,
 	      src1,
 	      src_undef(),
@@ -566,14 +566,14 @@ static void precalc_lit( struct brw_wm_compile *c,
    struct prog_src_register src0 = inst->SrcReg[0];
    struct prog_dst_register dst = inst->DstReg;
    
-   if (dst.WriteMask & WRITEMASK_XW) {
+   if (dst.WriteMask & BRW_WRITEMASK_XW) {
       struct prog_instruction *swz;
 
       /* dst.xw = swz src0.1111
        */
       swz = emit_op(c,
 		    TGSI_OPCODE_MOV,
-		    dst_mask(dst, WRITEMASK_XW),
+		    dst_mask(dst, BRW_WRITEMASK_XW),
 		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
 		    src_undef(),
@@ -582,10 +582,10 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
-   if (dst.WriteMask & WRITEMASK_YZ) {
+   if (dst.WriteMask & BRW_WRITEMASK_YZ) {
       emit_op(c,
 	      TGSI_OPCODE_LIT,
-	      dst_mask(dst, WRITEMASK_YZ),
+	      dst_mask(dst, BRW_WRITEMASK_YZ),
 	      inst->SaturateMode,
 	      src0,
 	      src_undef(),
@@ -649,7 +649,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
        /* tmp0 = 1 / tmp1 */
        emit_op(c, TGSI_OPCODE_RCP,
-               dst_mask(tmp0, WRITEMASK_X),
+               dst_mask(tmp0, BRW_WRITEMASK_X),
                0,
                tmp1src,
                src_undef(),
@@ -740,7 +740,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_ADD,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      tmpsrc,
 	      C0,
@@ -751,7 +751,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_Y),
+	      dst_mask(tmp, BRW_WRITEMASK_Y),
 	      0,
 	      tmpsrc,
 	      src_swizzle1(C0, W),
@@ -766,7 +766,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_XYZ),
+	      dst_mask(dst, BRW_WRITEMASK_XYZ),
 	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
@@ -776,7 +776,7 @@ static void precalc_tex( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MAD,
-	      dst_mask(dst, WRITEMASK_Y),
+	      dst_mask(dst, BRW_WRITEMASK_Y),
 	      0,
 	      src_swizzle1(tmpsrc, Z),
 	      src_swizzle1(C1, W),
@@ -863,7 +863,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_RCP,
-	      dst_mask(tmp, WRITEMASK_W),
+	      dst_mask(tmp, BRW_WRITEMASK_W),
 	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
@@ -873,7 +873,7 @@ static void precalc_txp( struct brw_wm_compile *c,
        */
       emit_op(c,
 	      TGSI_OPCODE_MUL,
-	      dst_mask(tmp, WRITEMASK_XYZ),
+	      dst_mask(tmp, BRW_WRITEMASK_XYZ),
 	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
@@ -1053,7 +1053,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XY;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XY;
 	 break;
 	 
       case TGSI_OPCODE_DST:
@@ -1082,7 +1082,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out = emit_insn(c, inst);
 	 /* This should probably be done in the parser. 
 	  */
-	 out->DstReg.WriteMask &= WRITEMASK_XYZ;
+	 out->DstReg.WriteMask &= BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_KIL: 
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 0c411b57f50..de5f5fe821f 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -32,7 +32,6 @@
 
 #include "brw_context.h"
 #include "brw_wm.h"
-#include "shader/prog_parameter.h"
 
 
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index d940ec09a9e..f2ae3a958f9 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -91,15 +91,15 @@ static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
    case TEXTURE_1D_INDEX:
-      return WRITEMASK_X;
+      return BRW_WRITEMASK_X;
    case TEXTURE_2D_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    case TEXTURE_3D_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_CUBE_INDEX:
-      return WRITEMASK_XYZ;
+      return BRW_WRITEMASK_XYZ;
    case TEXTURE_RECT_INDEX:
-      return WRITEMASK_XY;
+      return BRW_WRITEMASK_XY;
    default: return 0;
    }
 }
@@ -121,16 +121,16 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       GLuint read0, read1, read2;
 
       if (inst->opcode == TGSI_OPCODE_KIL) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); /* All args contribute to final */
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); /* All args contribute to final */
 	 continue;
       }
 
       if (inst->opcode == WM_FB_WRITE) {
-	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
-	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
+	 track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); 
+	 track_arg(c, inst, 1, BRW_WRITEMASK_XYZW); 
 	 if (c->key.source_depth_to_render_target &&
 	     c->key.computes_depth)
-	    track_arg(c, inst, 2, WRITEMASK_Z); 
+	    track_arg(c, inst, 2, BRW_WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
 	 continue;
@@ -191,9 +191,9 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case TGSI_OPCODE_XPD: 
-	 if (writemask & WRITEMASK_X) read0 |= WRITEMASK_YZ;	 
-	 if (writemask & WRITEMASK_Y) read0 |= WRITEMASK_XZ;	 
-	 if (writemask & WRITEMASK_Z) read0 |= WRITEMASK_XY;
+	 if (writemask & BRW_WRITEMASK_X) read0 |= BRW_WRITEMASK_YZ;	 
+	 if (writemask & BRW_WRITEMASK_Y) read0 |= BRW_WRITEMASK_XZ;	 
+	 if (writemask & BRW_WRITEMASK_Z) read0 |= BRW_WRITEMASK_XY;
 	 read1 = read0;
 	 break;
 
@@ -206,12 +206,12 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case TGSI_OPCODE_SCS:
       case WM_CINTERP:
       case WM_PIXELXY:
-	 read0 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_POW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_X;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case TGSI_OPCODE_TEX:
@@ -219,57 +219,57 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
          if (inst->tex_shadow)
-	    read0 |= WRITEMASK_Z;
+	    read0 |= BRW_WRITEMASK_Z;
 	 break;
 
       case TGSI_OPCODE_TXB:
 	 /* Shadow ignored for txb.
 	  */
-	 read0 = get_texcoord_mask(inst->tex_idx) | WRITEMASK_W;
+	 read0 = get_texcoord_mask(inst->tex_idx) | BRW_WRITEMASK_W;
 	 break;
 
       case WM_WPOSXY:
-	 read0 = writemask & WRITEMASK_XY;
+	 read0 = writemask & BRW_WRITEMASK_XY;
 	 break;
 
       case WM_DELTAXY:
-	 read0 = writemask & WRITEMASK_XY;
-	 read1 = WRITEMASK_X;
+	 read0 = writemask & BRW_WRITEMASK_XY;
+	 read1 = BRW_WRITEMASK_X;
 	 break;
 
       case WM_PIXELW:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_LINTERP:
-	 read0 = WRITEMASK_X;
-	 read1 = WRITEMASK_XY;
+	 read0 = BRW_WRITEMASK_X;
+	 read1 = BRW_WRITEMASK_XY;
 	 break;
 
       case WM_PINTERP:
-	 read0 = WRITEMASK_X; /* interpolant */
-	 read1 = WRITEMASK_XY; /* deltas */
-	 read2 = WRITEMASK_W; /* pixel w */
+	 read0 = BRW_WRITEMASK_X; /* interpolant */
+	 read1 = BRW_WRITEMASK_XY; /* deltas */
+	 read2 = BRW_WRITEMASK_W; /* pixel w */
 	 break;
 
       case TGSI_OPCODE_DP3:	
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZ;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZ;
 	 break;
 
       case TGSI_OPCODE_DPH:
-	 read0 = WRITEMASK_XYZ;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZ;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_DP4:
-	 read0 = WRITEMASK_XYZW;
-	 read1 = WRITEMASK_XYZW;
+	 read0 = BRW_WRITEMASK_XYZW;
+	 read1 = BRW_WRITEMASK_XYZW;
 	 break;
 
       case TGSI_OPCODE_LIT: 
-	 read0 = WRITEMASK_XYW;
+	 read0 = BRW_WRITEMASK_XYW;
 	 break;
 
       case TGSI_OPCODE_DST:
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index 86dcb74b5be..5045c9b4a66 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -31,7 +31,7 @@
                    
 
 #include "intel_mipmap_tree.h"
-#include "intel_batchbuffer.h"
+#include "brw_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
 
diff --git a/src/gallium/drivers/i965/intel_batchbuffer.h b/src/gallium/drivers/i965/intel_batchbuffer.h
deleted file mode 100644
index be04656aec7..00000000000
--- a/src/gallium/drivers/i965/intel_batchbuffer.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef INTEL_BATCHBUFFER_H
-#define INTEL_BATCHBUFFER_H
-
-#include "intel_bufmgr.h"
-#include "intel_reg.h"
-
-#define BATCH_SZ 16384
-#define BATCH_RESERVED 16
-
-enum cliprect_mode {
-   /**
-    * Batchbuffer contents may be looped over per cliprect, but do not
-    * require it.
-    */
-   IGNORE_CLIPRECTS,
-   /**
-    * Batchbuffer contents require looping over per cliprect at batch submit
-    * time.
-    *
-    * This will be upgraded to NO_LOOP_CLIPRECTS when there's a single
-    * constant cliprect, as in DRI2 or FBO rendering.
-    */
-   LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that should not be executed multiple
-    * times.
-    */
-   NO_LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that already handles cliprects, such
-    * as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE.
-    *
-    * Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch
-    * outside of LOCK/UNLOCK.  This is upgraded to just NO_LOOP_CLIPRECTS when
-    * there's a constant cliprect, as in DRI2 or FBO rendering.
-    */
-   REFERENCES_CLIPRECTS
-};
-
-struct intel_batchbuffer
-{
-   struct intel_context *intel;
-
-   struct brw_winsys_buffer *buf;
-
-   GLubyte *buffer;
-
-   GLubyte *map;
-   GLubyte *ptr;
-
-   GLuint size;
-
-   /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
-   struct {
-      GLuint total;
-      GLubyte *start_ptr;
-   } emit;
-
-   GLuint dirty_state;
-};
-
-struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context
-                                                  *intel);
-
-void intel_batchbuffer_free(struct intel_batchbuffer *batch);
-
-
-void _intel_batchbuffer_flush(struct intel_batchbuffer *batch,
-			      const char *file, int line);
-
-#define intel_batchbuffer_flush(batch) \
-	_intel_batchbuffer_flush(batch, __FILE__, __LINE__)
-
-void intel_batchbuffer_reset(struct intel_batchbuffer *batch);
-
-
-/* Unlike bmBufferData, this currently requires the buffer be mapped.
- * Consider it a convenience function wrapping multple
- * intel_buffer_dword() calls.
- */
-void intel_batchbuffer_data(struct intel_batchbuffer *batch,
-                            const void *data, GLuint bytes,
-			    enum cliprect_mode cliprect_mode);
-
-void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
-                                     GLuint bytes);
-
-GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
-                                       struct brw_winsys_buffer *buffer,
-				       uint32_t read_domains,
-				       uint32_t write_domain,
-				       uint32_t offset);
-
-/* Inline functions - might actually be better off with these
- * non-inlined.  Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static INLINE GLint
-intel_batchbuffer_space(struct intel_batchbuffer *batch)
-{
-   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
-}
-
-
-static INLINE void
-intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
-{
-   assert(batch->map);
-   assert(intel_batchbuffer_space(batch) >= 4);
-   *(GLuint *) (batch->ptr) = dword;
-   batch->ptr += 4;
-}
-
-static INLINE void
-intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
-                                GLuint sz,
-				enum cliprect_mode cliprect_mode)
-{
-   assert(sz < batch->size - 8);
-   if (intel_batchbuffer_space(batch) < sz)
-      intel_batchbuffer_flush(batch);
-
-   /* All commands should be executed once regardless of cliprect
-    * mode.
-    */
-   (void)cliprect_mode;
-}
-
-/* Here are the crusty old macros, to be removed:
- */
-#define BATCH_LOCALS
-
-#define BEGIN_BATCH(n, cliprect_mode) do {				\
-   intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
-   assert(intel->batch->emit.start_ptr == NULL);			\
-   intel->batch->emit.total = (n) * 4;					\
-   intel->batch->emit.start_ptr = intel->batch->ptr;			\
-} while (0)
-
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(intel->batch, d)
-
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
-   assert((unsigned) (delta) < buf->size);				\
-   intel_batchbuffer_emit_reloc(intel->batch, buf,			\
-				read_domains, write_domain, delta);	\
-} while (0)
-
-#define ADVANCE_BATCH() do {						\
-   unsigned int _n = intel->batch->ptr - intel->batch->emit.start_ptr;	\
-   assert(intel->batch->emit.start_ptr != NULL);			\
-   if (_n != intel->batch->emit.total) {				\
-      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",	\
-	      _n, intel->batch->emit.total);				\
-      abort();								\
-   }									\
-   intel->batch->emit.start_ptr = NULL;					\
-} while(0)
-
-
-static INLINE void
-intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
-{
-   intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
-   intel_batchbuffer_emit_dword(batch, MI_FLUSH);
-}
-
-#endif
-- 
cgit v1.2.3


From 09c231f84a20a306a173b60c82484ce1f9331edf Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 26 Oct 2009 00:20:33 +0000
Subject: i965g: still working on compilation

---
 src/gallium/auxiliary/tgsi/tgsi_scan.h          |   3 +
 src/gallium/drivers/i965/Makefile               |   9 +-
 src/gallium/drivers/i965/brw_batchbuffer.c      |  14 +-
 src/gallium/drivers/i965/brw_context.h          |  18 +-
 src/gallium/drivers/i965/brw_eu_emit.c          |   4 +-
 src/gallium/drivers/i965/brw_pipe_fb.c          |   2 +-
 src/gallium/drivers/i965/brw_pipe_flush.c       |   9 +-
 src/gallium/drivers/i965/brw_pipe_query.c       | 110 +++++++-----
 src/gallium/drivers/i965/brw_pipe_sampler.c     |  81 +++++++++
 src/gallium/drivers/i965/brw_screen_surface.c   | 156 ++++++++++++++---
 src/gallium/drivers/i965/brw_screen_texture.c   | 218 ++++++++++++++++++++++++
 src/gallium/drivers/i965/brw_sf.c               |  80 ++++-----
 src/gallium/drivers/i965/brw_sf.h               |  13 +-
 src/gallium/drivers/i965/brw_sf_emit.c          | 145 +++++++++-------
 src/gallium/drivers/i965/brw_sf_state.c         | 178 +++++++++----------
 src/gallium/drivers/i965/brw_state.h            |  13 +-
 src/gallium/drivers/i965/brw_state_batch.c      |   8 +-
 src/gallium/drivers/i965/brw_state_cache.c      |  64 ++++---
 src/gallium/drivers/i965/brw_state_debug.c      |  19 ++-
 src/gallium/drivers/i965/brw_state_dump.c       |  64 +++----
 src/gallium/drivers/i965/brw_state_upload.c     |  37 ++--
 src/gallium/drivers/i965/brw_tex.c              |  50 ------
 src/gallium/drivers/i965/brw_tex_layout.c       | 218 ------------------------
 src/gallium/drivers/i965/brw_urb.c              |  10 +-
 src/gallium/drivers/i965/brw_vs.h               |   2 +-
 src/gallium/drivers/i965/brw_vs_emit.c          |  20 +--
 src/gallium/drivers/i965/brw_vs_state.c         |   4 +-
 src/gallium/drivers/i965/brw_winsys.h           |  18 +-
 src/gallium/drivers/i965/brw_wm.c               |   4 +-
 src/gallium/drivers/i965/brw_wm.h               |  36 ++--
 src/gallium/drivers/i965/brw_wm_debug.c         |  68 ++++----
 src/gallium/drivers/i965/brw_wm_emit.c          |   8 +-
 src/gallium/drivers/i965/brw_wm_fp.c            |  18 +-
 src/gallium/drivers/i965/brw_wm_glsl.c          |  16 +-
 src/gallium/drivers/i965/brw_wm_pass0.c         |   6 +-
 src/gallium/drivers/i965/brw_wm_pass1.c         |   2 +-
 src/gallium/drivers/i965/brw_wm_pass2.c         |   4 +-
 src/gallium/drivers/i965/brw_wm_sampler_state.c | 170 ++++--------------
 src/gallium/drivers/i965/brw_wm_state.c         |   6 +-
 39 files changed, 1007 insertions(+), 898 deletions(-)
 create mode 100644 src/gallium/drivers/i965/brw_screen_texture.c
 delete mode 100644 src/gallium/drivers/i965/brw_tex.c
 delete mode 100644 src/gallium/drivers/i965/brw_tex_layout.c

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 8a7ee0c7e4f..6754001e885 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -61,6 +61,9 @@ struct tgsi_shader_info
    boolean uses_kill;  /**< KIL or KILP instruction used? */
    boolean uses_fogcoord; /**< fragment shader uses fog coord? */
    boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */
+
+   uint texture_max;
+   uint texture_mask;
 };
 
 
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
index 40e8aa8786d..c3dbad72ae6 100644
--- a/src/gallium/drivers/i965/Makefile
+++ b/src/gallium/drivers/i965/Makefile
@@ -28,10 +28,7 @@ C_SOURCES = \
 	brw_pipe_blend.c \
 	brw_pipe_depth.c \
 	brw_pipe_fb.c \
-	brw_pipe_flush.c \
 	brw_pipe_query.c \
-	brw_pipe_shader.c \
-	brw_screen_surface.c \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
@@ -40,8 +37,6 @@ C_SOURCES = \
 	brw_state_dump.c \
 	brw_state_upload.c \
 	brw_swtnl.c \
-	brw_tex.c \
-	brw_tex_layout.c \
 	brw_urb.c \
 	brw_util.c \
 	brw_vs.c \
@@ -60,8 +55,12 @@ C_SOURCES = \
 	brw_wm_sampler_state.c \
 	brw_wm_state.c \
 	brw_wm_surface_state.c \
+	brw_screen_surface.c \
+	brw_screen_texture.c \
 	brw_bo.c \
 	brw_batchbuffer.c \
+	brw_pipe_shader.c \
+	brw_pipe_flush.c \
 	intel_tex_layout.c 
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
index 8bcac76ede1..45fbd59273a 100644
--- a/src/gallium/drivers/i965/brw_batchbuffer.c
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -105,13 +105,13 @@ _brw_batchbuffer_flush(struct brw_batchbuffer *batch, const char *file,
    }
 
 
-   if (INTEL_DEBUG & DEBUG_BATCH)
-      fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
+   if (BRW_DEBUG & DEBUG_BATCH)
+      debug_printf("%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      used);
 
    /* Emit a flush if the bufmgr doesn't do it for us. */
    if (intel->always_flush_cache || !intel->ttm) {
-      *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
+      *(GLuint *) (batch->ptr) = ((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
       batch->ptr += 4;
       used = batch->ptr - batch->map;
    }
@@ -136,15 +136,15 @@ _brw_batchbuffer_flush(struct brw_batchbuffer *batch, const char *file,
       
    batch->sws->bo_exec(batch->buf, used, NULL, 0, 0 );
       
-   if (INTEL_DEBUG & DEBUG_BATCH) {
+   if (BRW_DEBUG & DEBUG_BATCH) {
       dri_bo_map(batch->buf, GL_FALSE);
       intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
 		   brw->brw_screen->pci_id);
       dri_bo_unmap(batch->buf);
    }
 
-   if (INTEL_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "waiting for idle\n");
+   if (BRW_DEBUG & DEBUG_SYNC) {
+      debug_printf("waiting for idle\n");
       dri_bo_map(batch->buf, GL_TRUE);
       dri_bo_unmap(batch->buf);
    }
@@ -166,7 +166,7 @@ brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
    int ret;
 
    if (batch->ptr - batch->map > batch->buf->size)
-      _mesa_printf ("bad relocation ptr %p map %p offset %d size %d\n",
+      debug_printf ("bad relocation ptr %p map %p offset %d size %d\n",
 		    batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
 
    ret = batch->sws->bo_emit_reloc(batch->buf,
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index df43d8ba4d5..10c1cf6f33a 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -190,6 +190,8 @@ struct brw_fragment_shader {
 #define PIPE_NEW_FRAMEBUFFER_DIMENSIONS 0x10000
 #define PIPE_NEW_DEPTH_BUFFER           0x20000
 #define PIPE_NEW_COLOR_BUFFERS          0x40000
+#define PIPE_NEW_QUERY                  0x80000
+#define PIPE_NEW_SCISSOR                0x100000
 
 
@@ -204,7 +206,7 @@ struct brw_fragment_shader {
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_WM_SURFACES		0x1000
-#define BRW_NEW_FENCE                   0x2000
+#define BRW_NEW_xxx                     0x2000 /* was FENCE */
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
 /**
@@ -373,6 +375,7 @@ struct brw_cache_item {
 
 struct brw_cache {
    struct brw_context *brw;
+   struct brw_winsys_screen *sws;
 
    struct brw_cache_item **items;
    GLuint size, n_items;
@@ -380,6 +383,7 @@ struct brw_cache {
    GLuint key_size[BRW_MAX_CACHE];		/* for fixed-size keys */
    GLuint aux_size[BRW_MAX_CACHE];
    char *name[BRW_MAX_CACHE];
+   
 
    /* Record of the last BOs chosen for each cache_id.  Used to set
     * brw->state.dirty.cache when a new cache item is chosen.
@@ -448,7 +452,7 @@ struct brw_query_object {
    int last_index;
 
    /* Total count of pixels from previous BOs */
-   unsigned int count;
+   uint64_t result;
 };
 
 
@@ -477,11 +481,18 @@ struct brw_context
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_state *zstencil;
 
+      const struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+      const struct pipe_sampler *sampler[PIPE_MAX_SAMPLERS];
+      unsigned num_textures;
+      unsigned num_samplers;
+      
+
       struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
       struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
       unsigned num_vertex_elements;
       unsigned num_vertex_buffers;
 
+      struct pipe_scissor_state scissor;
       struct pipe_framebuffer_state fb;
       struct pipe_viewport_state vp;
       struct pipe_clip_state ucp;
@@ -492,6 +503,8 @@ struct brw_context
       struct brw_blend_constant_color bcc;
       struct brw_polygon_stipple bps;
 
+      
+
       /**
        * Index buffer for this draw_prims call.
        *
@@ -688,6 +701,7 @@ struct brw_context
       struct brw_winsys_buffer *bo;
       int index;
       GLboolean active;
+      int stats_wm;
    } query;
 
    struct {
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index f6b8843e01e..f7fa5203488 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -1262,7 +1262,7 @@ void brw_SAMPLE(struct brw_compile *p,
    GLboolean need_stall = 0;
    
    if (writemask == 0) {
-      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+      /*debug_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1294,7 +1294,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* debug_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
index 63917172271..c65f9bc3747 100644
--- a/src/gallium/drivers/i965/brw_pipe_fb.c
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -53,7 +53,7 @@ static void brw_set_viewport_state( struct pipe_context *pipe,
 void brw_pipe_framebuffer_init( struct brw_context *brw )
 {
    brw->base.set_framebuffer_state = brw_set_framebuffer_state;
-   brw->base.set_framebuffer_state = brw_set_framebuffer_state;
+   brw->base.set_viewport_state = brw_set_viewport_state;
 }
 
 void brw_pipe_framebuffer_cleanup( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
index 65e71515178..fb4a784de9d 100644
--- a/src/gallium/drivers/i965/brw_pipe_flush.c
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -52,14 +52,7 @@ static void brw_note_fence( struct brw_context *brw, GLuint fence )
  */
 static GLuint brw_flush_cmd( void )
 {
-   struct brw_mi_flush flush;
-
-   return ;
-
-   flush.opcode = CMD_MI_FLUSH;
-   flush.pad = 0;
-   flush.flags = BRW_FLUSH_STATE_CACHE;
-   return *(GLuint *)&flush;
+   return ((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
index a2da1373bfd..18a9b71af07 100644
--- a/src/gallium/drivers/i965/brw_pipe_query.c
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -46,25 +46,38 @@
 #include "brw_reg.h"
 
 /** Waits on the query object's BO and totals the results for this query */
-static void
-brw_queryobj_get_results(struct brw_query_object *query)
+static boolean
+brw_query_get_result(struct pipe_context *pipe,
+		     struct pipe_query *q,
+		     boolean wait,
+		     uint64_t *result)
 {
-   int i;
-   uint64_t *results;
-
-   if (query->bo == NULL)
-      return;
+   struct brw_context *brw = brw_context(pipe);
+   struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Map and count the pixels from the current query BO */
-   dri_bo_map(query->bo, GL_FALSE);
-   results = query->bo->virtual;
-   for (i = query->first_index; i <= query->last_index; i++) {
-      query->Base.Result += results[i * 2 + 1] - results[i * 2];
+   if (query->bo) {
+      int i;
+      uint64_t *map;
+      
+      if (brw->sws->bo_is_busy(query->bo) && !wait)
+	 return FALSE;
+      
+      map = brw->sws->bo_map(query->bo, GL_FALSE);
+      if (map == NULL)
+	 return FALSE;
+      
+      for (i = query->first_index; i <= query->last_index; i++) {
+	 query->result += map[i * 2 + 1] - map[i * 2];
+      }
+
+      brw->sws->bo_unmap(query->bo);
+      brw->sws->bo_unreference(query->bo);
+      query->bo = NULL;
    }
-   dri_bo_unmap(query->bo);
 
-   brw->sws->bo_unreference(query->bo);
-   query->bo = NULL;
+   *result = query->result;
+   return TRUE;
 }
 
 static struct pipe_query *
@@ -72,12 +85,12 @@ brw_query_create(struct pipe_context *pipe, unsigned type )
 {
    struct brw_query_object *query;
 
-   switch (query->type) {
+   switch (type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       query = CALLOC_STRUCT( brw_query_object );
       if (query == NULL)
 	 return NULL;
-      return &query->Base;
+      return (struct pipe_query *)query;
       
    default:
       return NULL;
@@ -87,6 +100,7 @@ brw_query_create(struct pipe_context *pipe, unsigned type )
 static void
 brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
 {
+   struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    brw->sws->bo_unreference(query->bo);
@@ -94,24 +108,25 @@ brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
 }
 
 static void
-brw_begin_query(struct pipe_context *pipe, struct pipe_query *q)
+brw_query_begin(struct pipe_context *pipe, struct pipe_query *q)
 {
    struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* Reset our driver's tracking of query state. */
    brw->sws->bo_unreference(query->bo);
+   query->result = 0;
    query->bo = NULL;
    query->first_index = -1;
    query->last_index = -1;
 
    insert_at_head(&brw->query.active_head, query);
-   brw->stats_wm++;
-   brw->dirty.mesa |= PIPE_NEW_QUERY;
+   brw->query.stats_wm++;
+   brw->state.dirty.mesa |= PIPE_NEW_QUERY;
 }
 
 static void
-brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
+brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
 {
    struct brw_context *brw = brw_context(pipe);
    struct brw_query_object *query = (struct brw_query_object *)q;
@@ -129,27 +144,13 @@ brw_end_query(struct pipe_context *pipe, struct pipe_query *q)
    }
 
    remove_from_list(query);
-   brw->stats_wm--;
-   brw->dirty.mesa |= PIPE_NEW_QUERY;
+   brw->query.stats_wm--;
+   brw->state.dirty.mesa |= PIPE_NEW_QUERY;
 }
 
-static void brw_wait_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   brw_queryobj_get_results(query);
-   query->Base.Ready = GL_TRUE;
-}
-
-static void brw_check_query(struct pipe_context *pipe, struct pipe_query *q)
-{
-   struct brw_query_object *query = (struct brw_query_object *)q;
-
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
-      brw_queryobj_get_results(query);
-      query->Base.Ready = GL_TRUE;
-   }
-}
+/***********************************************************************
+ * Internal functions and callbacks to implement queries 
+ */
 
 /** Called to set up the query BO and account for its aperture space */
 void
@@ -201,8 +202,17 @@ brw_emit_query_begin(struct brw_context *brw)
 
    foreach(query, &brw->query.active_head) {
       if (query->bo != brw->query.bo) {
+	 uint64_t tmp;
+	 
+	 /* Propogate the results from this buffer to all of the
+	  * active queries, as the bo is going away.
+	  */
 	 if (query->bo != NULL)
-	    brw_queryobj_get_results(query);
+	    brw_query_get_result( &brw->base, 
+				  (struct pipe_query *)query,
+				  FALSE,
+				  &tmp );
+
 	 brw->sws->bo_reference(brw->query.bo);
 	 query->bo = brw->query.bo;
 	 query->first_index = brw->query.index;
@@ -235,12 +245,18 @@ brw_emit_query_end(struct brw_context *brw)
    brw->query.index++;
 }
 
-void brw_init_queryobj_functions(struct dd_function_table *functions)
+void brw_pipe_query_init( struct brw_context *brw )
 {
-   functions->NewQueryObject = brw_new_query_object;
-   functions->DeleteQuery = brw_delete_query;
-   functions->BeginQuery = brw_begin_query;
-   functions->EndQuery = brw_end_query;
-   functions->CheckQuery = brw_check_query;
-   functions->WaitQuery = brw_wait_query;
+   brw->base.create_query = brw_query_create;
+   brw->base.destroy_query = brw_query_destroy;
+   brw->base.begin_query = brw_query_begin;
+   brw->base.end_query = brw_query_end;
+   brw->base.get_query_result = brw_query_get_result;
+}
+
+
+void brw_pipe_query_cleanup( struct brw_context *brw )
+{
+   /* Unreference brw->query.bo ??
+    */
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
index b3069f08c00..bc20eef6fb2 100644
--- a/src/gallium/drivers/i965/brw_pipe_sampler.c
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -14,6 +14,87 @@ static void *brw_create_sampler_state( struct pipe_context *pipe,
 {
    struct brw_sampler_state *sampler = CALLOC_STRUCT(brw_sampler_state);
 
+   switch (key->minfilter) {
+   case GL_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+      break;
+   case GL_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+      break;
+   case GL_NEAREST_MIPMAP_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case GL_LINEAR_MIPMAP_NEAREST:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
+      break;
+   case GL_NEAREST_MIPMAP_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   case GL_LINEAR_MIPMAP_LINEAR:
+      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
+      break;
+   default:
+      break;
+   }
+
+   /* Set Anisotropy: 
+    */
+   if (key->max_aniso > 1.0) {
+      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
+      sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
+
+      if (key->max_aniso > 2.0) {
+	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
+				       BRW_ANISORATIO_16);
+      }
+   }
+   else {
+      switch (key->magfilter) {
+      case GL_NEAREST:
+	 sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+	 break;
+      case GL_LINEAR:
+	 sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	 break;
+      default:
+	 break;
+      }
+   }
+
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
+
+   /* Set LOD bias: 
+    */
+   sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
+
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set shadow function: 
+    */
+   if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
+      /* Shadowing is "enabled" by emitting a particular sampler
+       * message (sample_c).  So need to recompile WM program when
+       * shadow comparison is enabled on each/any texture unit.
+       */
+      sampler->ss0.shadow_function =
+	 intel_translate_shadow_compare_func(key->comparefunc);
+   }
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD: 
+    */
+   sampler->ss0.base_level = U_FIXED(0, 1);
+
+   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6);
+   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6);
 
    return (void *)sampler;
 }
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
index 544be6a0896..e0df6cc629b 100644
--- a/src/gallium/drivers/i965/brw_screen_surface.c
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -1,27 +1,131 @@
-   /* _NEW_BUFFERS */
-   if (IS_965(brw->brw_screen->pci_id) &&
-       !IS_G4X(brw->brw_screen->pci_id)) {
-      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-
-	 /* The original gen4 hardware couldn't set up WM surfaces pointing
-	  * at an offset within a tile, which can happen when rendering to
-	  * anything but the base level of a texture or the +X face/0 depth.
-	  * This was fixed with the 4 Series hardware.
-	  *
-	  * For these original chips, you would have to make the depth and
-	  * color destination surfaces include information on the texture
-	  * type, LOD, face, and various limits to use them as a destination.
-	  * I would have done this, but there's also a nasty requirement that
-	  * the depth and the color surfaces all be of the same LOD, which
-	  * may be a worse requirement than this alignment.  (Also, we may
-	  * want to just demote the texture to untiled, instead).
-	  */
-	 if (irb->region && 
-	     irb->region->tiling != I915_TILING_NONE &&
-	     (irb->region->draw_offset & 4095)) {
-	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
-	    return GL_TRUE;
-	 }
+
+#include "pipe/p_screen.h"
+#include "brw_screen.h"
+
+struct brw_surface_id {
+   unsigned face:3;
+   unsigned zslice:13;
+   unsigned level:16;
+};
+
+static boolean need_linear_view( struct brw_screen *brw_screen,
+				 struct brw_texture *brw_texture,
+				 unsigned face,
+				 unsigned level,
+				 unsigned zslice )
+{
+#if 0
+   /* XXX: what about IDGNG?
+    */
+   if (!BRW_IS_G4X(brw->brw_screen->pci_id))
+   {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+      /* The original gen4 hardware couldn't set up WM surfaces pointing
+       * at an offset within a tile, which can happen when rendering to
+       * anything but the base level of a texture or the +X face/0 depth.
+       * This was fixed with the 4 Series hardware.
+       *
+       * For these original chips, you would have to make the depth and
+       * color destination surfaces include information on the texture
+       * type, LOD, face, and various limits to use them as a destination.
+       *
+       * This is easy in Gallium as surfaces are all backed by
+       * textures, but there's also a nasty requirement that the depth
+       * and the color surfaces all be of the same LOD, which is
+       * harder to get around as we can't look at a surface in
+       * isolation and decide if it's legal.
+       *
+       * Instead, end up being pessimistic and say that for i965,
+       * ... ??
+       */
+      if (brw_tex->tiling != I915_TILING_NONE &&
+	  (brw_tex_image_offset(brw_tex, face, level, zslize) & 4095)) {
+	 if (BRW_DEBUG & DEBUG_VIEW)
+	    debug_printf("%s: need surface view for non-aligned tex image\n",
+			 __FUNCTION__);
+	 return GL_TRUE;
       }
+   }
+#endif
+
+   /* Tiled 3d textures don't have subsets that look like 2d surfaces:
+    */
+   
+   /* Everything else should be fine to render to in-place:
+    */
+   return GL_FALSE;
+}
+
+/* Look at all texture views and figure out if any of them need to be
+ * back-copied into the texture for sampling
+ */
+void brw_update_texture( struct pipe_screen *screen,
+			 struct pipe_texture *texture )
+{
+   /* currently nothing to do */
+}
+
+
+static struct pipe_surface *create_linear_view( struct brw_screen *brw_screen,
+						struct brw_texture *brw_tex,
+						struct brw_surface_id id )
+{
+   
+}
+
+static struct pipe_surface *create_in_place_view( struct brw_screen *brw_screen,
+						  struct brw_texture *brw_tex,
+						  struct brw_surface_id id )
+{
+   struct brw_surface *surface = CALLOC_STRUCT(brw_surface);
+   surface->id = id;
+   
+}
+
+/* Get a surface which is view into a texture 
+ */
+struct pipe_surface *brw_get_tex_surface(struct pipe_screen *screen,
+					 struct pipe_texture *texture,
+					 unsigned face, unsigned level,
+					 unsigned zslice,
+					 unsigned usage )
+{
+   struct brw_screen *bscreen = brw_screen(screen);
+   struct brw_surface_id id;
+
+   id.face = face;
+   id.level = level;
+   id.zslice = zslice;
+
+   if (need_linear_view(brw_screen, brw_tex, id)) 
+      type = BRW_VIEW_LINEAR;
+   else
+      type = BRW_VIEW_IN_PLACE;
+
+   
+   foreach (surface, texture->views[type]) {
+      if (id.value == surface->id.value)
+	 return surface;
+   }
+
+   switch (type) {
+   case BRW_VIEW_LINEAR:
+      surface = create_linear_view( texture, id, type );
+      break;
+   case BRW_VIEW_IN_PLACE:
+      surface = create_in_place_view( texture, id, type );
+      break;
+   default:
+      return NULL;
+   }
+
+   insert_at_head( texture->views[type], surface );
+   return surface;
+}
+
+
+void brw_tex_surface_destroy( struct pipe_surface *surface )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_screen_texture.c b/src/gallium/drivers/i965/brw_screen_texture.c
new file mode 100644
index 00000000000..50c30878c6f
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_texture.c
@@ -0,0 +1,218 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+/* Code to layout images in a mipmap tree for i965.
+ */
+
+#include "brw_tex_layout.h"
+
+#define FILE_DEBUG_FLAG DEBUG_MIPTREE
+
+GLboolean brw_miptree_layout(struct brw_context *brw,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
+{
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
+
+   switch (mt->target) {
+   case GL_TEXTURE_CUBE_MAP:
+      if (IS_IGDNG(brw->brw_screen->pci_id)) {
+          GLuint align_h = 2, align_w = 4;
+          GLuint level;
+          GLuint x = 0;
+          GLuint y = 0;
+          GLuint width = mt->width0;
+          GLuint height = mt->height0;
+          GLuint qpitch = 0;
+          GLuint y_pitch = 0;
+
+          mt->pitch = mt->width0;
+          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+          y_pitch = ALIGN(height, align_h);
+
+          if (mt->compressed) {
+              mt->pitch = ALIGN(mt->width0, align_w);
+          }
+
+          if (mt->last_level != 0) {
+              GLuint mip1_width;
+
+              if (mt->compressed) {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + ALIGN(minify(minify(mt->width0)), align_w);
+              } else {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + minify(minify(mt->width0));
+              }
+
+              if (mip1_width > mt->pitch) {
+                  mt->pitch = mip1_width;
+              }
+          }
+
+          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
+
+          if (mt->compressed) {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
+          for (level = 0; level <= mt->last_level; level++) {
+              GLuint img_height;
+              GLuint nr_images = 6;
+              GLuint q = 0;
+
+              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
+                                           height, 1);
+
+              for (q = 0; q < nr_images; q++)
+                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
+
+              if (mt->compressed)
+                  img_height = MAX2(1, height/4);
+              else
+                  img_height = ALIGN(height, align_h);
+
+              if (level == 1) {
+                  x += ALIGN(width, align_w);
+              }
+              else {
+                  y += img_height;
+              }
+
+              width  = minify(width);
+              height = minify(height);
+          }
+
+          break;
+      }
+
+   case GL_TEXTURE_3D: {
+      GLuint width  = mt->width0;
+      GLuint height = mt->height0;
+      GLuint depth = mt->depth0;
+      GLuint pack_x_pitch, pack_x_nr;
+      GLuint pack_y_pitch;
+      GLuint level;
+      GLuint align_h = 2;
+      GLuint align_w = 4;
+
+      mt->total_height = 0;
+      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+
+      if (mt->compressed) {
+          mt->pitch = ALIGN(width, align_w);
+          pack_y_pitch = (height + 3) / 4;
+      } else {
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
+      }
+
+      pack_x_pitch = width;
+      pack_x_nr = 1;
+
+      for (level = 0 ; level <= mt->last_level ; level++) {
+	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
+	 GLint x = 0;
+	 GLint y = 0;
+	 GLint q, j;
+
+	 intel_miptree_set_level_info(mt, level, nr_images,
+				      0, mt->total_height,
+				      width, height, depth);
+
+	 for (q = 0; q < nr_images;) {
+	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+	       intel_miptree_set_image_offset(mt, level, q, x, y);
+	       x += pack_x_pitch;
+	    }
+
+	    x = 0;
+	    y += pack_y_pitch;
+	 }
+
+
+	 mt->total_height += y;
+	 width  = minify(width);
+	 height = minify(height);
+	 depth  = minify(depth);
+
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
+
+      }
+      /* The 965's sampler lays cachelines out according to how accesses
+       * in the texture surfaces run, so they may be "vertical" through
+       * memory.  As a result, the docs say in Surface Padding Requirements:
+       * Sampling Engine Surfaces that two extra rows of padding are required.
+       * We don't know of similar requirements for pre-965, but given that
+       * those docs are silent on padding requirements in general, let's play
+       * it safe.
+       */
+      if (mt->target == GL_TEXTURE_CUBE_MAP)
+	 mt->total_height += 2;
+      break;
+   }
+
+   default:
+      i945_miptree_layout_2d(intel, mt, tiling);
+      break;
+   }
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
+		mt->total_height,
+		mt->cpp,
+		mt->pitch * mt->total_height * mt->cpp );
+
+   return GL_TRUE;
+}
+
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
index 1b73b3fd51c..013d839e37d 100644
--- a/src/gallium/drivers/i965/brw_sf.c
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -29,11 +29,12 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
   
+#include "pipe/p_state.h"
 
 #include "brw_batchbuffer.h"
-
 #include "brw_defines.h"
 #include "brw_context.h"
+#include "brw_pipe_rast.h"
 #include "brw_eu.h"
 #include "brw_util.h"
 #include "brw_sf.h"
@@ -45,7 +46,6 @@ static void compile_sf_prog( struct brw_context *brw,
    struct brw_sf_compile c;
    const GLuint *program;
    GLuint program_size;
-   GLuint i, idx;
 
    memset(&c, 0, sizeof(c));
 
@@ -54,7 +54,7 @@ static void compile_sf_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
 
    c.key = *key;
-   c.nr_attrs = util_count_bits(c.key.attrs);
+   c.nr_attrs = c.key.nr_attrs;
    c.nr_attr_regs = (c.nr_attrs+1)/2;
    c.nr_setup_attrs = c.key.nr_attrs;
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
@@ -62,21 +62,6 @@ static void compile_sf_prog( struct brw_context *brw,
    c.prog_data.urb_read_length = c.nr_attr_regs;
    c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
 
-   /* Construct map from attribute number to position in the vertex.
-    */
-   for (i = idx = 0; i < VERT_RESULT_MAX; i++) 
-      if (c.key.attrs & (1<<i)) {
-	 c.attr_to_idx[i] = idx;
-	 c.idx_to_attr[idx] = i;
-	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
-            c.point_attrs[i].CoordReplace = 
-               ctx->Point.CoordReplace[i - VERT_RESULT_TEX0];
-	 }
-         else {
-            c.point_attrs[i].CoordReplace = GL_FALSE;
-         }
-	 idx++;
-      }
    
    /* Which primitive?  Or all three? 
     */
@@ -122,7 +107,7 @@ static void compile_sf_prog( struct brw_context *brw,
 
 /* Calculate interpolants for triangle and line rasterization.
  */
-static void upload_sf_prog(struct brw_context *brw)
+static int upload_sf_prog(struct brw_context *brw)
 {
    struct brw_sf_prog_key key;
 
@@ -131,46 +116,49 @@ static void upload_sf_prog(struct brw_context *brw)
    /* Populate the key, noting state dependencies:
     */
    /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->nr_outputs_written; 
+   key.nr_attrs = brw->curr.vertex_shader->info.file_max[TGSI_FILE_OUTPUT] + 1;
+
+
+   /* XXX: this is probably where the mapping between vertex shader
+    * outputs and fragment shader inputs should be handled.  Assume
+    * for now 1:1 correspondance.
+    *
+    * XXX: scan frag shader inputs to work out linear vs. perspective
+    * interpolation below.
+    *
+    * XXX: as long as we're hard-wiring, is eg. position required to
+    * be linear?
+    */
+   key.linear_attrs = 0;
+   key.persp_attrs = (1 << key.nr_attrs) - 1;
 
    /* BRW_NEW_REDUCED_PRIMITIVE */
    switch (brw->reduced_primitive) {
-   case GL_TRIANGLES: 
-      /* NOTE: We just use the edgeflag attribute as an indicator that
-       * unfilled triangles are active.  We don't actually do the
-       * edgeflag testing here, it is already done in the clip
-       * program.
+   case PIPE_PRIM_TRIANGLES: 
+      /* PIPE_NEW_RAST
        */
-      if (key.attrs & (1<<VERT_RESULT_EDGE))
+      if (brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL ||
+	  brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL)
 	 key.primitive = SF_UNFILLED_TRIS;
       else
 	 key.primitive = SF_TRIANGLES;
       break;
-   case GL_LINES: 
+   case PIPE_PRIM_LINES: 
       key.primitive = SF_LINES; 
       break;
-   case GL_POINTS: 
+   case PIPE_PRIM_POINTS: 
       key.primitive = SF_POINTS; 
       break;
    }
 
-   key.do_point_sprite = ctx->Point.PointSprite;
-   key.SpriteOrigin = ctx->Point.SpriteOrigin;
-   /* _NEW_LIGHT */
-   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
-   key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
+   key.do_point_sprite = brw->curr.rast->templ.point_sprite;
+   key.sprite_origin_lower_left = 0; /* XXX: ctx->Point.SpriteOrigin - fix rast state */
+   key.do_flat_shading = brw->curr.rast->templ.flatshade;
+   key.do_twoside_color = brw->curr.rast->templ.light_twoside;
 
-   /* _NEW_HINT */
-   key.linear_color = 0;
-
-   /* _NEW_POLYGON */
    if (key.do_twoside_color) {
-      /* If we're rendering to a FBO, we have to invert the polygon
-       * face orientation, just as we invert the viewport in
-       * sf_unit_create_from_key().  ctx->DrawBuffer->Name will be
-       * nonzero if we're rendering to such an FBO.
-       */
-      key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
+      key.frontface_ccw = (brw->curr.rast->templ.front_winding == 
+			   PIPE_WINDING_CCW);
    }
 
    brw->sws->bo_unreference(brw->sf.prog_bo);
@@ -180,14 +168,16 @@ static void upload_sf_prog(struct brw_context *brw)
 				      &brw->sf.prog_data);
    if (brw->sf.prog_bo == NULL)
       compile_sf_prog( brw, &key );
+
+   return 0;
 }
 
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
+      .mesa  = (PIPE_NEW_RAST | PIPE_NEW_VERTEX_SHADER),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = CACHE_NEW_VS_PROG
+      .cache = 0
    },
    .prepare = upload_sf_prog
 };
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
index c99116b8b1b..0b7003dc5ef 100644
--- a/src/gallium/drivers/i965/brw_sf.h
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -49,14 +49,21 @@ struct brw_sf_prog_key {
     */
    GLuint persp_attrs:32;
    GLuint linear_attrs:32;
+   GLuint point_coord_replace_attrs:32;
 
+   GLuint nr_attrs:8;
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
    GLuint sprite_origin_lower_left:1;
-   GLuint pad:25;
+   GLuint pad:17;
+
+   GLuint attr_col0:8;
+   GLuint attr_col1:8;
+   GLuint attr_bfc0:8;
+   GLuint attr_bfc1:8;
 };
 
 struct brw_sf_point_tex {
@@ -101,9 +108,7 @@ struct brw_sf_compile {
    GLuint nr_setup_attrs;
    GLuint nr_setup_regs;
 
-   GLubyte attr_to_idx[VERT_RESULT_MAX];   
-   GLubyte idx_to_attr[VERT_RESULT_MAX];   
-   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
+   GLuint point_coord_replace_mask;
 };
 
  
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
index 4acb2b7d72e..db52c9553e7 100644
--- a/src/gallium/drivers/i965/brw_sf_emit.c
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -43,17 +43,12 @@ static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
 				    struct brw_reg vert,
 				    GLuint attr)
 {
-   GLuint off = c->attr_to_idx[attr] / 2;
-   GLuint sub = c->attr_to_idx[attr] % 2;
+   GLuint off = attr / 2;
+   GLuint sub = attr % 2;
 
    return brw_vec4_grf(vert.nr + off, sub * 4);
 }
 
-static GLboolean have_attr(struct brw_sf_compile *c,
-			   GLuint attr)
-{
-   return (c->key.attrs & (1<<attr)) ? 1 : 0;
-}
 
 /*********************************************************************** 
  * Twoside lighting
@@ -62,15 +57,16 @@ static void copy_bfc( struct brw_sf_compile *c,
 		      struct brw_reg vert )
 {
    struct brw_compile *p = &c->func;
-   GLuint i;
 
-   for (i = 0; i < 2; i++) {
-      if (have_attr(c, VERT_RESULT_COL0+i) &&
-	  have_attr(c, VERT_RESULT_BFC0+i))
-	 brw_MOV(p, 
-		 get_vert_attr(c, vert, VERT_RESULT_COL0+i), 
-		 get_vert_attr(c, vert, VERT_RESULT_BFC0+i));
-   }
+   if (c->key.attr_col0 && c->key.attr_bfc0)
+      brw_MOV(p, 
+	      get_vert_attr(c, vert, c->key.attr_col0), 
+	      get_vert_attr(c, vert, c->key.attr_bfc0));
+
+   if (c->key.attr_col1 && c->key.attr_bfc1)
+      brw_MOV(p, 
+	      get_vert_attr(c, vert, c->key.attr_col1), 
+	      get_vert_attr(c, vert, c->key.attr_bfc1));
 }
 
 
@@ -89,8 +85,8 @@ static void do_twoside_color( struct brw_sf_compile *c )
     * for user-supplied vertex programs, as t_vp_build.c always does
     * the right thing.
     */
-   if (!(have_attr(c, VERT_RESULT_COL0) && have_attr(c, VERT_RESULT_BFC0)) &&
-       !(have_attr(c, VERT_RESULT_COL1) && have_attr(c, VERT_RESULT_BFC1)))
+   if (!(c->key.attr_col0 && c->key.attr_bfc0) &&
+       !(c->key.attr_col1 && c->key.attr_bfc1))
       return;
    
    /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
@@ -126,14 +122,17 @@ static void copy_colors( struct brw_sf_compile *c,
 		     struct brw_reg src)
 {
    struct brw_compile *p = &c->func;
-   GLuint i;
 
-   for (i = VERT_RESULT_COL0; i <= VERT_RESULT_COL1; i++) {
-      if (have_attr(c,i))
-	 brw_MOV(p, 
-		 get_vert_attr(c, dst, i), 
-		 get_vert_attr(c, src, i));
-   }
+   if (c->key.attr_col0)
+      brw_MOV(p, 
+	      get_vert_attr(c, dst, c->key.attr_col0), 
+	      get_vert_attr(c, src, c->key.attr_col0));
+
+   if (c->key.attr_col1)
+      brw_MOV(p, 
+	      get_vert_attr(c, dst, c->key.attr_col1), 
+	      get_vert_attr(c, src, c->key.attr_col1));
+
 }
 
 
@@ -146,10 +145,16 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
+   GLuint nr = 0;
 
-   if (!nr)
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
+
+   if (nr == 0)
       return;
 
    /* Already done in clip program:
@@ -184,10 +189,16 @@ static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
-   GLuint nr = util_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
+   GLuint nr = 0;
+
+   if (c->key.attr_col0)
+      nr++;
+
+   if (c->key.attr_col1)
+      nr++;
 
-   if (!nr)
+   if (nr == 0)
       return;
 
    /* Already done in clip program: 
@@ -319,10 +330,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    *pc_linear = 0;
    *pc = 0xf;
       
-   if (persp_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (persp_mask & (1 << (reg*2))) 
       *pc_persp = 0xf;
 
-   if (linear_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (linear_mask & (1 << (reg*2))) 
       *pc_linear = 0xf;
 
    /* Maybe only processs one attribute on the final round:
@@ -330,10 +341,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    if (reg*2+1 < c->nr_setup_attrs) {
       *pc |= 0xf0;
 
-      if (persp_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (persp_mask & (1 << (reg*2+1))) 
 	 *pc_persp |= 0xf0;
 
-      if (linear_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (linear_mask & (1 << (reg*2+1))) 
 	 *pc_linear |= 0xf0;
    }
 
@@ -513,24 +524,28 @@ void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
       alloc_regs(c);
 
    copy_z_inv_w(c);
+
    for (i = 0; i < c->nr_setup_regs; i++)
    {
-      struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]];
+      /* XXX: only seems to check point_coord_replace_attrs for every
+       * second attribute?!?
+       */
+      boolean coord_replace = !!(c->key.point_coord_replace_attrs & (1<<(2*i)));
       struct brw_reg a0 = offset(c->vert[0], i);
       GLushort pc, pc_persp, pc_linear;
       GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
             
       if (pc_persp)
       {				
-	  if (!tex->CoordReplace) {
-	      brw_set_predicate_control_flag_value(p, pc_persp);
-	      brw_MUL(p, a0, a0, c->inv_w[0]);
-	  }
+	 if (coord_replace) {
+	    brw_set_predicate_control_flag_value(p, pc_persp);
+	    brw_MUL(p, a0, a0, c->inv_w[0]);
+	 }
       }
 
-      if (tex->CoordReplace) {
-	  /* Caculate 1.0/PointWidth */
-	  brw_math(&c->func,
+      if (coord_replace) {
+	 /* Caculate 1.0/PointWidth */
+	 brw_math(&c->func,
 		  c->tmp,
 		  BRW_MATH_FUNCTION_INV,
 		  BRW_MATH_SATURATE_NONE,
@@ -539,33 +554,37 @@ void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
 		  BRW_MATH_DATA_SCALAR,
 		  BRW_MATH_PRECISION_FULL);
 
-	  if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  } else {
-	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
-	  	brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
-		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
-	  }
-      } else {
-	  brw_MOV(p, c->m1Cx, brw_imm_ud(0));
-	  brw_MOV(p, c->m2Cy, brw_imm_ud(0));
+	 if (c->key.sprite_origin_lower_left) {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 } 
+	 else {
+	    brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	    brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
+	    brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	 }
+      } 
+      else {
+	 brw_MOV(p, c->m1Cx, brw_imm_ud(0));
+	 brw_MOV(p, c->m2Cy, brw_imm_ud(0));
       }
 
       {
 	 brw_set_predicate_control_flag_value(p, pc); 
-	 if (tex->CoordReplace) {
-	     if (c->key.sprite_origin_lower_left) {
-		 brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
-		 brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
-	     }
-	     else
-		 brw_MOV(p, c->m3C0, brw_imm_f(0.0));
-	 } else {
-	 	brw_MOV(p, c->m3C0, a0); /* constant value */
+	 if (coord_replace) {
+	    if (c->key.sprite_origin_lower_left) {
+	       brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
+	       brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
+	    }
+	    else {
+	       brw_MOV(p, c->m3C0, brw_imm_f(0.0));
+	    }
+	 } 
+	 else {
+	    brw_MOV(p, c->m3C0, a0); /* constant value */
 	 }
 
 	 /* Copy m0..m3 to URB. 
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
index 648a16a038c..fbc9f15eb43 100644
--- a/src/gallium/drivers/i965/brw_sf_state.c
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -29,58 +29,48 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
    
+#include "util/u_math.h"
 
+#include "pipe/p_state.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
 
-static void upload_sf_vp(struct brw_context *brw)
+static int upload_sf_vp(struct brw_context *brw)
 {
-   const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   const struct pipe_viewport_state *vp = &brw->curr.vp;
+   const struct pipe_scissor_state *scissor = &brw->curr.scissor;
    struct brw_sf_viewport sfv;
-   GLfloat y_scale, y_bias;
-   const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   y_scale = 1.0;
-   y_bias = 0;
+   /* PIPE_NEW_VIEWPORT, PIPE_NEW_SCISSOR */
 
-   /* _NEW_VIEWPORT */
+   sfv.viewport.m00 = vp->scale[0];
+   sfv.viewport.m11 = vp->scale[1];
+   sfv.viewport.m22 = vp->scale[2];
+   sfv.viewport.m30 = vp->translate[0];
+   sfv.viewport.m31 = vp->translate[1];
+   sfv.viewport.m32 = vp->translate[2];
 
-   sfv.viewport.m00 = v[MAT_SX];
-   sfv.viewport.m11 = v[MAT_SY] * y_scale;
-   sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
-   sfv.viewport.m30 = v[MAT_TX];
-   sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
-   sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
-    * for DrawBuffer->_[XY]{min,max}
-    */
-
-   /* The scissor only needs to handle the intersection of drawable and
-    * scissor rect.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-   /* Y=0=bottom */
-   sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
-   sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-   sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
-   sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
+   sfv.scissor.xmin = scissor->minx;
+   sfv.scissor.xmax = scissor->maxx; /* -1 ?? */
+   sfv.scissor.ymin = scissor->miny;
+   sfv.scissor.ymax = scissor->maxy; /* -1 ?? */
 
    brw->sws->bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
-      .mesa  = (_NEW_VIEWPORT | 
-		_NEW_SCISSOR |
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_VIEWPORT | 
+		PIPE_NEW_SCISSOR),
       .brw   = 0,
       .cache = 0
    },
@@ -90,15 +80,17 @@ const struct brw_tracked_state brw_sf_vp = {
 struct brw_sf_unit_key {
    unsigned int total_grf;
    unsigned int urb_entry_read_length;
-
    unsigned int nr_urb_entries, urb_size, sfsize;
-
-   GLenum front_face, cull_face, provoking_vertex;
+   
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
    unsigned point_attenuated:1;
-   unsigned render_to_fbo:1;
+   unsigned front_face:2;
+   unsigned cull_mode:2;
+   unsigned flatshade_first:1;
+   unsigned gl_rasterization_rules:1;
+   unsigned line_last_pixel_enable:1;
    float line_width;
    float point_size;
 };
@@ -106,6 +98,7 @@ struct brw_sf_unit_key {
 static void
 sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 {
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_SF_PROG */
@@ -117,25 +110,22 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->urb_size = brw->urb.vsize;
    key->sfsize = brw->urb.sfsize;
 
-   key->scissor = ctx->Scissor.Enabled;
-   key->front_face = ctx->Polygon.FrontFace;
-
-   if (ctx->Polygon.CullFlag)
-      key->cull_face = ctx->Polygon.CullFaceMode;
-   else
-      key->cull_face = GL_NONE;
-
-   key->line_width = ctx->Line.Width;
-   key->line_smooth = ctx->Line.SmoothFlag;
-
-   key->point_sprite = ctx->Point.PointSprite;
-   key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
-   key->point_attenuated = ctx->Point._Attenuated;
-
-   /* _NEW_LIGHT */
-   key->provoking_vertex = ctx->Light.ProvokingVertex;
-
-   key->render_to_fbo = 1;
+   /* PIPE_NEW_RAST */
+   key->scissor = rast->scissor;
+   key->front_face = rast->front_winding;
+   key->cull_mode = rast->cull_mode;
+   key->line_smooth = rast->line_smooth;
+   key->line_width = rast->line_width;
+   key->flatshade_first = rast->flatshade_first;
+   key->line_last_pixel_enable = rast->line_last_pixel;
+   key->gl_rasterization_rules = rast->gl_rasterization_rules;
+
+   key->point_sprite = rast->point_sprite;
+   key->point_attenuated = rast->point_size_per_vertex;
+
+   key->point_size = CLAMP(rast->point_size, 
+			   rast->point_size_min, 
+			   rast->point_size_max);
 }
 
 static struct brw_winsys_buffer *
@@ -147,7 +137,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
-   sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   sf.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    sf.thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */
 
    sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
@@ -174,10 +164,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       sf.thread4.max_threads = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       sf.thread4.stats_enable = 1;
 
    /* CACHE_NEW_SF_VP */
@@ -185,31 +175,30 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.sf5.viewport_transform = 1;
 
-   /* _NEW_SCISSOR */
    if (key->scissor)
       sf.sf6.scissor = 1;
 
-   /* _NEW_POLYGON */
-   if (key->front_face == GL_CCW)
+   if (key->front_face == PIPE_WINDING_CCW)
       sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
-   switch (key->cull_face) {
-   case GL_FRONT:
-      sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
+   switch (key->cull_mode) {
+   case PIPE_WINDING_CCW:
+   case PIPE_WINDING_CW:
+      sf.sf6.cull_mode = (key->front_face == key->cull_mode ?
+			  BRW_CULLMODE_FRONT :
+			  BRW_CULLMODE_BACK);
       break;
-   case GL_BACK:
-      sf.sf6.cull_mode = BRW_CULLMODE_BACK;
-      break;
-   case GL_FRONT_AND_BACK:
+   case PIPE_WINDING_BOTH:
       sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
       break;
-   case GL_NONE:
+   case PIPE_WINDING_NONE:
       sf.sf6.cull_mode = BRW_CULLMODE_NONE;
       break;
    default:
       assert(0);
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
       break;
    }
 
@@ -223,9 +212,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_BUFFERS */
-   key->render_to_fbo = 1;
-   if (!key->render_to_fbo) {
+   /* XXX: gl_rasterization_rules?  something else?
+    */
+   if (0) {
       /* Rendering to an OpenGL window */
       sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
    }
@@ -261,7 +250,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+   if (!key->flatshade_first) {
       sf.sf7.trifan_pv = 2;
       sf.sf7.linestrip_pv = 1;
       sf.sf7.tristrip_pv = 2;
@@ -270,12 +259,19 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
       sf.sf7.linestrip_pv = 0;
       sf.sf7.tristrip_pv = 0;
    }
-   sf.sf7.line_last_pixel_enable = 0;
+
+   sf.sf7.line_last_pixel_enable = key->line_last_pixel_enable;
 
    /* Set bias for OpenGL rasterization rules:
     */
-   sf.sf6.dest_org_vbias = 0x8;
-   sf.sf6.dest_org_hbias = 0x8;
+   if (key->gl_rasterization_rules) {
+      sf.sf6.dest_org_vbias = 0x8;
+      sf.sf6.dest_org_hbias = 0x8;
+   }
+   else {
+      sf.sf6.dest_org_vbias = 0x0;
+      sf.sf6.dest_org_hbias = 0x0;
+   }
 
    bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
 			 key, sizeof(*key),
@@ -287,23 +283,23 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
     */
    /* Emit SF program relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     sf.thread0.grf_reg_count << 1,
-		     offsetof(struct brw_sf_unit_state, thread0),
-		     brw->sf.prog_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   sf.thread0.grf_reg_count << 1,
+			   offsetof(struct brw_sf_unit_state, thread0),
+			   brw->sf.prog_bo);
 
    /* Emit SF viewport relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
-		     offsetof(struct brw_sf_unit_state, sf5),
-		     brw->sf.vp_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   sf.sf5.front_winding | (sf.sf5.viewport_transform << 1),
+			   offsetof(struct brw_sf_unit_state, sf5),
+			   brw->sf.vp_bo);
 
    return bo;
 }
 
-static void upload_sf_unit( struct brw_context *brw )
+static int upload_sf_unit( struct brw_context *brw )
 {
    struct brw_sf_unit_key key;
    struct brw_winsys_buffer *reloc_bufs[2];
@@ -321,16 +317,12 @@ static void upload_sf_unit( struct brw_context *brw )
    if (brw->sf.state_bo == NULL) {
       brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs);
    }
+   return 0;
 }
 
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
-      .mesa  = (_NEW_POLYGON | 
-		_NEW_LIGHT |
-		_NEW_LINE | 
-		_NEW_POINT | 
-		_NEW_SCISSOR |
-		_NEW_BUFFERS),
+      .mesa  = (PIPE_NEW_RAST),
       .brw   = BRW_NEW_URB_FENCE,
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
index 663fc839dfe..2275e9ad690 100644
--- a/src/gallium/drivers/i965/brw_state.h
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -168,9 +168,20 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache( struct brw_context *brw );
 
-/* brw_wm_surface_state.c */
+/***********************************************************************
+ * brw_wm_surface_state.c 
+ */
 struct brw_winsys_buffer *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key );
 
+/***********************************************************************
+ * brw_state_debug.c
+ */
+void brw_update_dirty_counts( unsigned mesa,
+			      unsigned brw,
+			      unsigned cache );
+
+
+
 #endif
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
index 324fce51636..7d212e5c247 100644
--- a/src/gallium/drivers/i965/brw_state_batch.c
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -46,7 +46,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct brw_cached_batch_item *item = brw->cached_batch_items;
    struct header *newheader = (struct header *)data;
 
-   if (brw->emit_state_always) {
+   if (brw->flags.always_emit_state) {
       brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
       return GL_TRUE;
    }
@@ -56,8 +56,8 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
 	    return GL_FALSE;
 	 if (item->sz != sz) {
-	    _mesa_free(item->header);
-	    item->header = _mesa_malloc(sz);
+	    FREE(item->header);
+	    item->header = MALLOC(sz);
 	    item->sz = sz;
 	 }
 	 goto emit;
@@ -67,7 +67,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
    assert(!item);
    item = CALLOC_STRUCT(brw_cached_batch_item);
-   item->header = _mesa_malloc(sz);
+   item->header = MALLOC(sz);
    item->sz = sz;
    item->next = brw->cached_batch_items;
    brw->cached_batch_items = item;
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
index 97f88b3ab39..4310d01ba2e 100644
--- a/src/gallium/drivers/i965/brw_state_cache.c
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -55,7 +55,9 @@
  * only one of the two buffers referenced gets put into the offset, and the
  * incorrect program is run for the other instance.
  */
+#include "util/u_memory.h"
 
+#include "brw_debug.h"
 #include "brw_state.h"
 #include "brw_batchbuffer.h"
 
@@ -107,9 +109,9 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    if (bo == cache->last_bo[cache_id])
       return; /* no change */
 
-   brw->sws->bo_unreference(cache->last_bo[cache_id]);
+   cache->sws->bo_unreference(cache->last_bo[cache_id]);
    cache->last_bo[cache_id] = bo;
-   brw->sws->bo_reference(cache->last_bo[cache_id]);
+   cache->sws->bo_reference(cache->last_bo[cache_id]);
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
@@ -127,7 +129,7 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
    for (c = cache->items[hash % cache->size]; c; c = c->next)
       bucketcount++;
 
-   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
+   debug_printf("bucket %d/%d = %d/%d items\n", hash % cache->size,
 	   cache->size, bucketcount, cache->n_items);
 #endif
 
@@ -154,7 +156,7 @@ rehash(struct brw_cache *cache)
    GLuint size, i;
 
    size = cache->size * 3;
-   items = (struct brw_cache_item**) _mesa_calloc(size * sizeof(*items));
+   items = (struct brw_cache_item**) CALLOC(size, sizeof(*items));
 
    for (i = 0; i < cache->size; i++)
       for (c = cache->items[i]; c; c = next) {
@@ -194,7 +196,7 @@ brw_search_cache(struct brw_cache *cache,
 
    update_cache_last(cache, cache_id, item->bo);
 
-   brw->sws->bo_reference(item->bo);
+   cache->sws->bo_reference(item->bo);
    return item->bo;
 }
 
@@ -219,20 +221,25 @@ brw_upload_cache( struct brw_cache *cache,
    struct brw_winsys_buffer *bo;
    int i;
 
-   /* Create the buffer object to contain the data */
-   bo = brw->sws->bo_alloc(cache->sws,
-			   cache->buffer_type[cache_id], data_size, 1 << 6);
+   /* Create the buffer object to contain the data.  For now, use a
+    * single buffer type to describe all cached state atoms.  Later,
+    * may want to take advantage of hardware distinctions between
+    * these various entities.
+    */
+   bo = cache->sws->bo_alloc(cache->sws,
+			     BRW_BUFFER_TYPE_STATE_CACHE, 
+			     data_size, 1 << 6);
 
 
    /* Set up the memory containing the key, aux_data, and reloc_bufs */
-   tmp = _mesa_malloc(key_size + aux_size + relocs_size);
+   tmp = MALLOC(key_size + aux_size + relocs_size);
 
    memcpy(tmp, key, key_size);
    memcpy(tmp + key_size, aux, cache->aux_size[cache_id]);
    memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
    for (i = 0; i < nr_reloc_bufs; i++) {
       if (reloc_bufs[i] != NULL)
-	 brw->sws->bo_reference(reloc_bufs[i]);
+	 cache->sws->bo_reference(reloc_bufs[i]);
    }
 
    item->cache_id = cache_id;
@@ -243,7 +250,7 @@ brw_upload_cache( struct brw_cache *cache,
    item->nr_reloc_bufs = nr_reloc_bufs;
 
    item->bo = bo;
-   brw->sws->bo_reference(bo);
+   cache->sws->bo_reference(bo);
    item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
@@ -259,13 +266,13 @@ brw_upload_cache( struct brw_cache *cache,
       *(void **)aux_return = (void *)((char *)item->key + item->key_size);
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("upload %s: %d bytes to cache id %d\n",
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("upload %s: %d bytes to cache id %d\n",
 		   cache->name[cache_id],
 		   data_size, cache_id);
 
    /* Copy data to the buffer */
-   dri_bo_subdata(bo, 0, data_size, data);
+   cache->sws->bo_subdata(bo, 0, data_size, data);
 
    update_cache_last(cache, cache_id, bo);
 
@@ -292,7 +299,7 @@ brw_cache_data_sz(struct brw_cache *cache,
 		       reloc_bufs, nr_reloc_bufs);
    if (item) {
       update_cache_last(cache, cache_id, item->bo);
-      brw->sws->bo_reference(item->bo);
+      cache->sws->bo_reference(item->bo);
       return item->bo;
    }
 
@@ -349,11 +356,12 @@ brw_init_non_surface_cache(struct brw_context *brw)
    struct brw_cache *cache = &brw->cache;
 
    cache->brw = brw;
+   cache->sws = brw->sws;
 
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+      CALLOC(cache->size, sizeof(struct brw_cache_item));
 
    brw_init_cache_id(cache,
 		     "CC_VP",
@@ -457,7 +465,7 @@ brw_init_surface_cache(struct brw_context *brw)
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+      CALLOC(cache->size, sizeof(struct brw_cache_item));
 
    brw_init_cache_id(cache,
 		     "SS_SURFACE",
@@ -487,8 +495,8 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    struct brw_cache_item *c, *next;
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
@@ -507,7 +515,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
-      _mesa_free(brw->curbe.last_buf);
+      FREE(brw->curbe.last_buf);
       brw->curbe.last_buf = NULL;
    }
 
@@ -527,8 +535,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
    struct brw_cache_item **prev;
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < cache->size; i++) {
       for (prev = &cache->items[i]; *prev;) {
@@ -540,8 +548,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 	    *prev = c->next;
 
 	    for (j = 0; j < c->nr_reloc_bufs; j++)
-	       brw->sws->bo_unreference(c->reloc_bufs[j]);
-	    brw->sws->bo_unreference(c->bo);
+	       cache->sws->bo_unreference(c->reloc_bufs[j]);
+	    cache->sws->bo_unreference(c->bo);
 	    free((void *)c->key);
 	    free(c);
 	    cache->n_items--;
@@ -555,8 +563,8 @@ brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
 void
 brw_state_cache_check_size(struct brw_context *brw)
 {
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
 
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
@@ -574,8 +582,8 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   if (INTEL_DEBUG & DEBUG_STATE)
-      _mesa_printf("%s\n", __FUNCTION__);
+   if (BRW_DEBUG & DEBUG_STATE)
+      debug_printf("%s\n", __FUNCTION__);
 
    brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
diff --git a/src/gallium/drivers/i965/brw_state_debug.c b/src/gallium/drivers/i965/brw_state_debug.c
index 22cea4b7d85..cc4744dc166 100644
--- a/src/gallium/drivers/i965/brw_state_debug.c
+++ b/src/gallium/drivers/i965/brw_state_debug.c
@@ -109,8 +109,25 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
       if (bit_map[i].bit == 0)
 	 return;
 
-      fprintf(stderr, "0x%08x: %12d (%s)\n",
+      debug_printf("0x%08x: %12d (%s)\n",
 	      bit_map[i].bit, bit_map[i].count, bit_map[i].name);
    }
 }
 
+void
+brw_update_dirty_counts( unsigned mesa,
+			 unsigned brw,
+			 unsigned cache )
+{
+   static int dirty_count = 0;
+
+   brw_update_dirty_count(mesa_bits, mesa);
+   brw_update_dirty_count(brw_bits, brw);
+   brw_update_dirty_count(cache_bits, cache);
+      if (dirty_count++ % 1000 == 0) {
+	 brw_print_dirty_count(mesa_bits, mesa);
+	 brw_print_dirty_count(brw_bits, brw);
+	 brw_print_dirty_count(cache_bits, cache);
+	 debug_printf("\n");
+      }
+}
diff --git a/src/gallium/drivers/i965/brw_state_dump.c b/src/gallium/drivers/i965/brw_state_dump.c
index 1bc83fb9c15..72604304d41 100644
--- a/src/gallium/drivers/i965/brw_state_dump.c
+++ b/src/gallium/drivers/i965/brw_state_dump.c
@@ -28,6 +28,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_winsys.h"
 
 /**
  * Prints out a header, the contents, and the message associated with
@@ -44,28 +45,32 @@ state_out(const char *name, void *data, uint32_t hw_offset, int index,
 {
     va_list va;
 
-    fprintf(stderr, "%8s: 0x%08x: 0x%08x: ",
-	    name, hw_offset + index * 4, ((uint32_t *)data)[index]);
+    debug_printf("%8s: 0x%08x: 0x%08x: ",
+		 name, hw_offset + index * 4, ((uint32_t *)data)[index]);
     va_start(va, fmt);
-    vfprintf(stderr, fmt, va);
+    debug_vprintf(fmt, va);
     va_end(va);
 }
 
 /** Generic, undecoded state buffer debug printout */
 static void
-state_struct_out(const char *name, struct brw_winsys_buffer *buffer, unsigned int state_size)
+state_struct_out(struct brw_winsys_screen *sws,
+		 const char *name,
+		 struct brw_winsys_buffer *buffer,
+		 unsigned int state_size)
 {
    int i;
+   void *data;
 
    if (buffer == NULL)
       return;
 
-   dri_bo_map(buffer, GL_FALSE);
+   data = sws->bo_map(buffer, GL_FALSE);
    for (i = 0; i < state_size / 4; i++) {
-      state_out(name, buffer->virtual, buffer->offset, i,
+      state_out(name, data, buffer->offset, i,
 		"dword %d\n", i);
    }
-   dri_bo_unmap(buffer);
+   sws->bo_unmap(buffer);
 }
 
 static const char *
@@ -106,12 +111,11 @@ static void dump_wm_surface_state(struct brw_context *brw)
       char name[20];
 
       if (surf_bo == NULL) {
-	 fprintf(stderr, "  WM SS%d: NULL\n", i);
+	 debug_printf("  WM SS%d: NULL\n", i);
 	 continue;
       }
-      dri_bo_map(surf_bo, GL_FALSE);
+      surf = (struct brw_surface_state *)brw->sws->bo_map(surf_bo, GL_FALSE);
       surfoff = surf_bo->offset;
-      surf = (struct brw_surface_state *)(surf_bo->virtual);
 
       sprintf(name, "WM SS%d", i);
       state_out(name, surf, surfoff, 0, "%s %s\n",
@@ -127,7 +131,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
       state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
 		surf->ss5.x_offset, surf->ss5.y_offset);
 
-      dri_bo_unmap(surf_bo);
+      brw->sws->bo_unmap(surf_bo);
    }
 }
 
@@ -140,9 +144,7 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    if (brw->sf.vp_bo == NULL)
       return;
 
-   dri_bo_map(brw->sf.vp_bo, GL_FALSE);
-
-   vp = brw->sf.vp_bo->virtual;
+   vp = (struct brw_sf_viewport *)brw->sws->bo_map(brw->sf.vp_bo, GL_FALSE);
    vp_off = brw->sf.vp_bo->offset;
 
    state_out(name, vp, vp_off, 0, "m00 = %f\n", vp->viewport.m00);
@@ -157,10 +159,12 @@ static void dump_sf_viewport_state(struct brw_context *brw)
    state_out(name, vp, vp_off, 7, "bottom right = %d,%d\n",
 	     vp->scissor.xmax, vp->scissor.ymax);
 
-   dri_bo_unmap(brw->sf.vp_bo);
+   brw->sws->bo_unmap(brw->sf.vp_bo);
 }
 
-static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
+static void brw_debug_prog(struct brw_winsys_screen *sws,
+			   const char *name,
+			   struct brw_winsys_buffer *prog)
 {
    unsigned int i;
    uint32_t *data;
@@ -168,12 +172,10 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
    if (prog == NULL)
       return;
 
-   dri_bo_map(prog, GL_FALSE);
-
-   data = prog->virtual;
+   data = (uint32_t *)sws->bo_map(prog, GL_FALSE);
 
    for (i = 0; i < prog->size / 4 / 4; i++) {
-      fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+      debug_printf("%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
 	      name, (unsigned int)prog->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
       /* Stop at the end of the program.  It'd be nice to keep track of the actual
@@ -186,7 +188,7 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
 	 break;
    }
 
-   dri_bo_unmap(prog);
+   sws->bo_unmap(prog);
 }
 
 
@@ -202,19 +204,21 @@ static void brw_debug_prog(const char *name, struct brw_winsys_buffer *prog)
  */
 void brw_debug_batch(struct brw_context *brw)
 {
-   state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
+   struct brw_winsys_screen *sws = brw->sws;
+
+   state_struct_out(sws, "WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces);
    dump_wm_surface_state(brw);
 
-   state_struct_out("VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state));
-   brw_debug_prog("VS prog", brw->vs.prog_bo);
+   state_struct_out(sws, "VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state));
+   brw_debug_prog(sws, "VS prog", brw->vs.prog_bo);
 
-   state_struct_out("GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state));
-   brw_debug_prog("GS prog", brw->gs.prog_bo);
+   state_struct_out(sws, "GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state));
+   brw_debug_prog(sws, "GS prog", brw->gs.prog_bo);
 
-   state_struct_out("SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state));
+   state_struct_out(sws, "SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state));
    dump_sf_viewport_state(brw);
-   brw_debug_prog("SF prog", brw->sf.prog_bo);
+   brw_debug_prog(sws, "SF prog", brw->sf.prog_bo);
 
-   state_struct_out("WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state));
-   brw_debug_prog("WM prog", brw->wm.prog_bo);
+   state_struct_out(sws, "WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state));
+   brw_debug_prog(sws, "WM prog", brw->wm.prog_bo);
 }
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
index 8659e35289f..eff3a40a464 100644
--- a/src/gallium/drivers/i965/brw_state_upload.c
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -34,6 +34,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_batchbuffer.h"
+#include "brw_debug.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
@@ -83,12 +84,8 @@ const struct brw_tracked_state *atoms[] =
    &brw_blend_constant_color,
 
    &brw_depthbuffer,
-
    &brw_polygon_stipple,
-   &brw_polygon_stipple_offset,
-
    &brw_line_stipple,
-   &brw_aa_line_parameters,
 
    &brw_psp_urb_cbs,
 
@@ -163,11 +160,12 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
 {
    struct brw_state_flags *state = &brw->state.dirty;
    GLuint i;
+   int ret;
 
    brw_clear_validated_bos(brw);
-   brw_add_validated_bo(brw, intel->batch->buf);
+   brw_add_validated_bo(brw, brw->batch->buf);
 
-   if (brw->emit_state_always) {
+   if (brw->flags.always_emit_state) {
       state->mesa |= ~0;
       state->brw |= ~0;
       state->cache |= ~0;
@@ -199,10 +197,10 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
     * If this fails, we can experience GPU lock-ups.
     */
    {
-      const struct brw_fragment_program *fp = brw->fragment_program;
+      const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
       if (fp) {
-         assert(fp->info.max_sampler <= brw->nr_samplers &&
-		fp->info.max_texture <= brw->nr_textures);
+         assert(fp->info.file_max[TGSI_FILE_SAMPLER] < brw->curr.num_samplers &&
+		fp->info.texture_max < brw->curr.num_textures);
       }
    }
 
@@ -213,18 +211,18 @@ enum pipe_error brw_validate_state( struct brw_context *brw )
 enum pipe_error brw_upload_state(struct brw_context *brw)
 {
    struct brw_state_flags *state = &brw->state.dirty;
+   int ret;
    int i;
-   static int dirty_count = 0;
 
    brw_clear_validated_bos(brw);
 
-   if (INTEL_DEBUG) {
+   if (BRW_DEBUG) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
        * state atoms are ordered correctly in the list.
        */
       struct brw_state_flags examined, prev;      
-      _mesa_memset(&examined, 0, sizeof(examined));
+      memset(&examined, 0, sizeof(examined));
       prev = *state;
 
       for (i = 0; i < Elements(atoms); i++) {
@@ -268,19 +266,14 @@ enum pipe_error brw_upload_state(struct brw_context *brw)
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_STATE) {
-      brw_update_dirty_count(mesa_bits, state->mesa);
-      brw_update_dirty_count(brw_bits, state->brw);
-      brw_update_dirty_count(cache_bits, state->cache);
-      if (dirty_count++ % 1000 == 0) {
-	 brw_print_dirty_count(mesa_bits, state->mesa);
-	 brw_print_dirty_count(brw_bits, state->brw);
-	 brw_print_dirty_count(cache_bits, state->cache);
-	 debug_printf("\n");
-      }
+   if (BRW_DEBUG & DEBUG_STATE) {
+      brw_update_dirty_counts( state->mesa, 
+			       state->brw,
+			       state->cache );
    }
    
    /* Clear dirty flags:
     */
    memset(state, 0, sizeof(*state));
+   return 0;
 }
diff --git a/src/gallium/drivers/i965/brw_tex.c b/src/gallium/drivers/i965/brw_tex.c
deleted file mode 100644
index 6f7adb6393c..00000000000
--- a/src/gallium/drivers/i965/brw_tex.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-        
-
-#include "brw_context.h"
-
-/**
- * Finalizes all textures, completing any rendering that needs to be done
- * to prepare them.
- */
-void brw_validate_textures( struct brw_context *brw )
-{
-   int i;
-
-   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
-
-      if (texUnit->_ReallyEnabled) {
-	 intel_finalize_mipmap_tree(intel, i);
-      }
-   }
-}
diff --git a/src/gallium/drivers/i965/brw_tex_layout.c b/src/gallium/drivers/i965/brw_tex_layout.c
deleted file mode 100644
index 50c30878c6f..00000000000
--- a/src/gallium/drivers/i965/brw_tex_layout.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-/* Code to layout images in a mipmap tree for i965.
- */
-
-#include "brw_tex_layout.h"
-
-#define FILE_DEBUG_FLAG DEBUG_MIPTREE
-
-GLboolean brw_miptree_layout(struct brw_context *brw,
-			     struct intel_mipmap_tree *mt,
-			     uint32_t tiling)
-{
-   /* XXX: these vary depending on image format: */
-   /* GLint align_w = 4; */
-
-   switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(brw->brw_screen->pci_id)) {
-          GLuint align_h = 2, align_w = 4;
-          GLuint level;
-          GLuint x = 0;
-          GLuint y = 0;
-          GLuint width = mt->width0;
-          GLuint height = mt->height0;
-          GLuint qpitch = 0;
-          GLuint y_pitch = 0;
-
-          mt->pitch = mt->width0;
-          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
-          y_pitch = ALIGN(height, align_h);
-
-          if (mt->compressed) {
-              mt->pitch = ALIGN(mt->width0, align_w);
-          }
-
-          if (mt->last_level != 0) {
-              GLuint mip1_width;
-
-              if (mt->compressed) {
-                  mip1_width = ALIGN(minify(mt->width0), align_w)
-                      + ALIGN(minify(minify(mt->width0)), align_w);
-              } else {
-                  mip1_width = ALIGN(minify(mt->width0), align_w)
-                      + minify(minify(mt->width0));
-              }
-
-              if (mip1_width > mt->pitch) {
-                  mt->pitch = mip1_width;
-              }
-          }
-
-          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
-
-          if (mt->compressed) {
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
-          } else {
-              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
-              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
-          }
-
-          for (level = 0; level <= mt->last_level; level++) {
-              GLuint img_height;
-              GLuint nr_images = 6;
-              GLuint q = 0;
-
-              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
-                                           height, 1);
-
-              for (q = 0; q < nr_images; q++)
-                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
-
-              if (mt->compressed)
-                  img_height = MAX2(1, height/4);
-              else
-                  img_height = ALIGN(height, align_h);
-
-              if (level == 1) {
-                  x += ALIGN(width, align_w);
-              }
-              else {
-                  y += img_height;
-              }
-
-              width  = minify(width);
-              height = minify(height);
-          }
-
-          break;
-      }
-
-   case GL_TEXTURE_3D: {
-      GLuint width  = mt->width0;
-      GLuint height = mt->height0;
-      GLuint depth = mt->depth0;
-      GLuint pack_x_pitch, pack_x_nr;
-      GLuint pack_y_pitch;
-      GLuint level;
-      GLuint align_h = 2;
-      GLuint align_w = 4;
-
-      mt->total_height = 0;
-      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
-
-      if (mt->compressed) {
-          mt->pitch = ALIGN(width, align_w);
-          pack_y_pitch = (height + 3) / 4;
-      } else {
-	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
-	 pack_y_pitch = ALIGN(mt->height0, align_h);
-      }
-
-      pack_x_pitch = width;
-      pack_x_nr = 1;
-
-      for (level = 0 ; level <= mt->last_level ; level++) {
-	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
-	 GLint x = 0;
-	 GLint y = 0;
-	 GLint q, j;
-
-	 intel_miptree_set_level_info(mt, level, nr_images,
-				      0, mt->total_height,
-				      width, height, depth);
-
-	 for (q = 0; q < nr_images;) {
-	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
-	       intel_miptree_set_image_offset(mt, level, q, x, y);
-	       x += pack_x_pitch;
-	    }
-
-	    x = 0;
-	    y += pack_y_pitch;
-	 }
-
-
-	 mt->total_height += y;
-	 width  = minify(width);
-	 height = minify(height);
-	 depth  = minify(depth);
-
-	 if (mt->compressed) {
-	    pack_y_pitch = (height + 3) / 4;
-
-	    if (pack_x_pitch > ALIGN(width, align_w)) {
-	       pack_x_pitch = ALIGN(width, align_w);
-	       pack_x_nr <<= 1;
-	    }
-	 } else {
-	    if (pack_x_pitch > 4) {
-	       pack_x_pitch >>= 1;
-	       pack_x_nr <<= 1;
-	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-	    }
-
-	    if (pack_y_pitch > 2) {
-	       pack_y_pitch >>= 1;
-	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-	    }
-	 }
-
-      }
-      /* The 965's sampler lays cachelines out according to how accesses
-       * in the texture surfaces run, so they may be "vertical" through
-       * memory.  As a result, the docs say in Surface Padding Requirements:
-       * Sampling Engine Surfaces that two extra rows of padding are required.
-       * We don't know of similar requirements for pre-965, but given that
-       * those docs are silent on padding requirements in general, let's play
-       * it safe.
-       */
-      if (mt->target == GL_TEXTURE_CUBE_MAP)
-	 mt->total_height += 2;
-      break;
-   }
-
-   default:
-      i945_miptree_layout_2d(intel, mt, tiling);
-      break;
-   }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
-		mt->pitch,
-		mt->total_height,
-		mt->cpp,
-		mt->pitch * mt->total_height * mt->cpp );
-
-   return GL_TRUE;
-}
-
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index a2277519ad7..ff2466528df 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -184,17 +184,17 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	     * entries and the values for minimum nr of entries
 	     * provided above.
 	     */
-	    _mesa_printf("couldn't calculate URB layout!\n");
+	    debug_printf("couldn't calculate URB layout!\n");
 	    exit(1);
 	 }
 	 
-	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    _mesa_printf("URB CONSTRAINED\n");
+	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+	    debug_printf("URB CONSTRAINED\n");
       }
 
 done:
-      if (INTEL_DEBUG & DEBUG_URB)
-	 _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+      if (BRW_DEBUG & DEBUG_URB)
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
 		      brw->urb.clip_start,
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 54f7d7d7c49..e33fa2f0aad 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -64,7 +64,7 @@ struct brw_vs_compile {
 
    struct brw_reg r0;
    struct brw_reg r1;
-   struct brw_reg regs[PROGRAM_ADDRESS+1][128];
+   struct brw_reg regs[TGSI_FILE_COUNT][128];
    struct brw_reg tmp;
    struct brw_reg stack;
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 086f54799e6..04132a167bc 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -242,10 +242,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    c->prog_data.total_grf = reg;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
-      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
-      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+      debug_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      debug_printf("%s reg = %d\n", __FUNCTION__, reg);
    }
 }
 
@@ -1248,10 +1248,10 @@ void brw_vs_emit(struct brw_vs_compile *c )
    GLuint index;
    GLuint file;
 
-   if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("vs-mesa:\n");
+   if (BRW_DEBUG & DEBUG_VS) {
+      debug_printf("vs-mesa:\n");
       _mesa_print_program(&c->vp->program.Base); 
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1526,12 +1526,12 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    post_vs_emit(c, end_inst, last_inst);
 
-   if (INTEL_DEBUG & DEBUG_VS) {
+   if (BRW_DEBUG & DEBUG_VS) {
       int i;
 
-      _mesa_printf("vs-native:\n");
+      debug_printf("vs-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index 1717223e49c..05a91f2de44 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -122,7 +122,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
 				  1, chipset_max_threads) - 1;
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       vs.thread4.max_threads = 0;
 
    /* No samplers for ARB_vp programs:
@@ -131,7 +131,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.vs5.sampler_count = 0;
 
-   if (INTEL_DEBUG & DEBUG_STATS)
+   if (BRW_DEBUG & DEBUG_STATS)
       vs.thread4.stats_enable = 1;
 
    /* Vertex program always enabled:
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
index 51e23b96404..33032276bc1 100644
--- a/src/gallium/drivers/i965/brw_winsys.h
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -69,6 +69,7 @@ enum brw_buffer_type
    BRW_BUFFER_TYPE_SHADER_CONSTANTS,
    BRW_BUFFER_TYPE_WM_SCRATCH,
    BRW_BUFFER_TYPE_BATCH,
+   BRW_BUFFER_TYPE_STATE_CACHE,
 };
 
 
@@ -156,11 +157,15 @@ struct brw_winsys_screen {
 			  unsigned offset,
 			  struct brw_winsys_buffer *b2);
 
-   void (*bo_subdata)(struct brw_winsys_buffer *dst,
+   void (*bo_subdata)(struct brw_winsys_buffer *buffer,
 		      size_t offset,
 		      size_t size,
 		      const void *data);
 
+   boolean (*bo_is_busy)(struct brw_winsys_buffer *buffer);
+   boolean (*bo_references)(struct brw_winsys_buffer *a,
+			    struct brw_winsys_buffer *b);
+
    /* XXX: couldn't this be handled by returning true/false on
     * bo_emit_reloc?
     */
@@ -171,18 +176,13 @@ struct brw_winsys_screen {
    /**
     * Map a buffer.
     */
-   void *(*buffer_map)(struct brw_winsys *iws,
-                       struct brw_winsys_buffer *buffer,
-                       boolean write);
+   void *(*bo_map)(struct brw_winsys_buffer *buffer,
+		   boolean write);
 
    /**
     * Unmap a buffer.
     */
-   void (*buffer_unmap)(struct brw_winsys *iws,
-                        struct brw_winsys_buffer *buffer);
-
-   void (*buffer_destroy)(struct brw_winsys *iws,
-                          struct brw_winsys_buffer *buffer);
+   void (*bo_unmap)(struct brw_winsys_buffer *buffer);
    /*@}*/
 
 
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 764708f7df9..3d889699f89 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -178,8 +178,8 @@ static void do_wm_prog( struct brw_context *brw,
       brw_wm_non_glsl_emit(brw, c);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM)
-      fprintf(stderr, "\n");
+   if (BRW_DEBUG & DEBUG_WM)
+      debug_printf("\n");
 
    /* get the program
     */
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index bf241f5fa4d..5bc2a49c1f4 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -33,6 +33,7 @@
 #ifndef BRW_WM_H
 #define BRW_WM_H
 
+#include "tgsi/tgsi_ureg.h"
 
 #include "brw_context.h"
 #include "brw_eu.h"
@@ -57,17 +58,18 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
+   unsigned proj_attrib_mask; /**< one bit per fragment program attribute */
+   unsigned linear_attrib_mask:1;  /**< linear interpolation vs perspective interp */
+
    GLuint source_depth_reg:3;
    GLuint aa_dest_stencil_reg:3;
    GLuint dest_depth_reg:3;
    GLuint nr_depth_regs:3;
-   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint computes_depth:1;
    GLuint source_depth_to_render_target:1;
    GLuint flat_shade:1;
-   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint runtime_check_aads_emit:1;
-   
-   GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
+
    GLuint shadowtex_mask:16;
    GLuint yuvtex_mask:16;
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
@@ -75,7 +77,7 @@ struct brw_wm_prog_key {
    GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
-   GLuint drawable_height;
+
    GLuint vp_nr_outputs_written;
 };
 
@@ -151,7 +153,7 @@ struct brw_wm_instruction {
 };
 
 
-#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
+#define BRW_WM_MAX_INSN  2048
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
 #define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
@@ -161,11 +163,19 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_SUBROUTINE 16
 
 
+struct ureg_instruction {
+   unsigned opcode:8;
+   unsigned tex_target:3;
+   struct ureg_dst dst;
+   struct ureg_src src[3];
+};
+
 
 /* New opcodes to track internal operations required for WM unit.
  * These are added early so that the registers used can be tracked,
  * freed and reused like those of other instructions.
  */
+#define MAX_OPCODE        TGSI_OPCODE_LAST
 #define WM_PIXELXY        (MAX_OPCODE)
 #define WM_DELTAXY        (MAX_OPCODE + 1)
 #define WM_PIXELW         (MAX_OPCODE + 2)
@@ -177,7 +187,7 @@ struct brw_wm_instruction {
 #define WM_FRONTFACING    (MAX_OPCODE + 8)
 #define MAX_WM_OPCODE     (MAX_OPCODE + 9)
 
-#define PROGRAM_PAYLOAD   (PROGRAM_FILE_MAX)
+#define PROGRAM_PAYLOAD   (TGSI_FILE_COUNT)
 #define PAYLOAD_DEPTH     (FRAG_ATTRIB_MAX)
 
 struct brw_wm_compile {
@@ -198,15 +208,15 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
+   struct ureg_instruction prog_instructions[BRW_WM_MAX_INSN];
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
    GLuint fp_fragcolor_emitted;
 
-   struct prog_src_register pixel_xy;
-   struct prog_src_register delta_xy;
-   struct prog_src_register pixel_w;
+   struct ureg_src pixel_xy;
+   struct ureg_src delta_xy;
+   struct ureg_src pixel_w;
 
 
    struct brw_wm_value vreg[BRW_WM_MAX_VREG];
@@ -217,7 +227,7 @@ struct brw_wm_compile {
 
    struct {
       struct brw_wm_value depth[4]; /* includes r0/r1 */
-      struct brw_wm_value input_interp[FRAG_ATTRIB_MAX];
+      struct brw_wm_value input_interp[PIPE_MAX_SHADER_INPUTS];
    } payload;
 
 
@@ -295,7 +305,7 @@ void brw_wm_lookup_iz( GLuint line_aa,
 		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
+//GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 void emit_ddxy(struct brw_compile *p,
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
index c6659646f2c..04dec5ba392 100644
--- a/src/gallium/drivers/i965/brw_wm_debug.c
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -41,21 +41,21 @@ void brw_wm_print_value( struct brw_wm_compile *c,
    if (c->state >= PASS2_DONE) 
       brw_print_reg(value->hw_reg);
    else if( value == &c->undef_value )
-      _mesa_printf("undef");
+      debug_printf("undef");
    else if( value - c->vreg >= 0 &&
 	    value - c->vreg < BRW_WM_MAX_VREG)
-      _mesa_printf("r%d", value - c->vreg);
+      debug_printf("r%d", value - c->vreg);
    else if (value - c->creg >= 0 &&
 	    value - c->creg < BRW_WM_MAX_PARAM)
-      _mesa_printf("c%d", value - c->creg);
+      debug_printf("c%d", value - c->creg);
    else if (value - c->payload.input_interp >= 0 &&
 	    value - c->payload.input_interp < FRAG_ATTRIB_MAX)
-      _mesa_printf("i%d", value - c->payload.input_interp);
+      debug_printf("i%d", value - c->payload.input_interp);
    else if (value - c->payload.depth >= 0 &&
 	    value - c->payload.depth < FRAG_ATTRIB_MAX)
-      _mesa_printf("d%d", value - c->payload.depth);
+      debug_printf("d%d", value - c->payload.depth);
    else 
-      _mesa_printf("?");
+      debug_printf("?");
 }
 
 void brw_wm_print_ref( struct brw_wm_compile *c,
@@ -64,16 +64,16 @@ void brw_wm_print_ref( struct brw_wm_compile *c,
    struct brw_reg hw_reg = ref->hw_reg;
 
    if (ref->unspill_reg)
-      _mesa_printf("UNSPILL(%x)/", ref->value->spill_slot);
+      debug_printf("UNSPILL(%x)/", ref->value->spill_slot);
 
    if (c->state >= PASS2_DONE)
       brw_print_reg(ref->hw_reg);
    else {
-      _mesa_printf("%s", hw_reg.negate ? "-" : "");
-      _mesa_printf("%s", hw_reg.abs ? "abs/" : "");
+      debug_printf("%s", hw_reg.negate ? "-" : "");
+      debug_printf("%s", hw_reg.abs ? "abs/" : "");
       brw_wm_print_value(c, ref->value);
       if ((hw_reg.nr&1) || hw_reg.subnr) {
-	 _mesa_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
+	 debug_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
       }
    }
 }
@@ -84,22 +84,22 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    GLuint i, arg;
    GLuint nr_args = brw_wm_nr_args(inst->opcode);
 
-   _mesa_printf("[");
+   debug_printf("[");
    for (i = 0; i < 4; i++) {
       if (inst->dst[i]) {
 	 brw_wm_print_value(c, inst->dst[i]);
 	 if (inst->dst[i]->spill_slot)
-	    _mesa_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
+	    debug_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
       }
       else
-	 _mesa_printf("#");
+	 debug_printf("#");
       if (i < 3)      
-	 _mesa_printf(",");
+	 debug_printf(",");
    }
-   _mesa_printf("]");
+   debug_printf("]");
 
    if (inst->writemask != BRW_WRITEMASK_XYZW)
-      _mesa_printf(".%s%s%s%s", 
+      debug_printf(".%s%s%s%s", 
 		   GET_BIT(inst->writemask, 0) ? "x" : "",
 		   GET_BIT(inst->writemask, 1) ? "y" : "",
 		   GET_BIT(inst->writemask, 2) ? "z" : "",
@@ -107,58 +107,58 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 
    switch (inst->opcode) {
    case WM_PIXELXY:
-      _mesa_printf(" = PIXELXY");
+      debug_printf(" = PIXELXY");
       break;
    case WM_DELTAXY:
-      _mesa_printf(" = DELTAXY");
+      debug_printf(" = DELTAXY");
       break;
    case WM_PIXELW:
-      _mesa_printf(" = PIXELW");
+      debug_printf(" = PIXELW");
       break;
    case WM_WPOSXY:
-      _mesa_printf(" = WPOSXY");
+      debug_printf(" = WPOSXY");
       break;
    case WM_PINTERP:
-      _mesa_printf(" = PINTERP");
+      debug_printf(" = PINTERP");
       break;
    case WM_LINTERP:
-      _mesa_printf(" = LINTERP");
+      debug_printf(" = LINTERP");
       break;
    case WM_CINTERP:
-      _mesa_printf(" = CINTERP");
+      debug_printf(" = CINTERP");
       break;
    case WM_FB_WRITE:
-      _mesa_printf(" = FB_WRITE");
+      debug_printf(" = FB_WRITE");
       break;
    case WM_FRONTFACING:
-      _mesa_printf(" = FRONTFACING");
+      debug_printf(" = FRONTFACING");
       break;
    default:
-      _mesa_printf(" = %s", _mesa_opcode_string(inst->opcode));
+      debug_printf(" = %s", _mesa_opcode_string(inst->opcode));
       break;
    }
 
    if (inst->saturate)
-      _mesa_printf("_SAT");
+      debug_printf("_SAT");
 
    for (arg = 0; arg < nr_args; arg++) {
 
-      _mesa_printf(" [");
+      debug_printf(" [");
 
       for (i = 0; i < 4; i++) {
 	 if (inst->src[arg][i]) {
 	    brw_wm_print_ref(c, inst->src[arg][i]);
 	 }
 	 else
-	    _mesa_printf("%%");
+	    debug_printf("%%");
 
 	 if (i < 3) 
-	    _mesa_printf(",");
+	    debug_printf(",");
 	 else
-	    _mesa_printf("]");
+	    debug_printf("]");
       }
    }
-   _mesa_printf("\n");
+   debug_printf("\n");
 }
 
 void brw_wm_print_program( struct brw_wm_compile *c,
@@ -166,9 +166,9 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 {
    GLuint insn;
 
-   _mesa_printf("%s:\n", stage);
+   debug_printf("%s:\n", stage);
    for (insn = 0; insn < c->nr_insns; insn++)
       brw_wm_print_insn(c, &c->instruction[insn]);
-   _mesa_printf("\n");
+   debug_printf("\n");
 }
 
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
index 7df9b79d7a2..5f7ae6592c6 100644
--- a/src/gallium/drivers/i965/brw_wm_emit.c
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -1481,7 +1481,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       default:
-	 _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
+	 debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
 		      inst->opcode, inst->opcode < MAX_OPCODE ?
 				    _mesa_opcode_string(inst->opcode) :
 				    "unknown");
@@ -1494,12 +1494,12 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       int i;
 
-      _mesa_printf("wm-native:\n");
+      debug_printf("wm-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
index be240031c7c..d5947307301 100644
--- a/src/gallium/drivers/i965/brw_wm_fp.c
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -142,7 +142,7 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
    int bit = _mesa_ffs( ~c->fp_temp );
 
    if (!bit) {
-      _mesa_printf("%s: out of temporaries\n", __FILE__);
+      debug_printf("%s: out of temporaries\n", __FILE__);
       exit(1);
    }
 
@@ -977,7 +977,7 @@ static void print_insns( const struct prog_instruction *insn,
 {
    GLuint i;
    for (i = 0; i < nr; i++, insn++) {
-      _mesa_printf("%3d: ", i);
+      debug_printf("%3d: ", i);
       if (insn->Opcode < MAX_OPCODE)
 	 _mesa_print_instruction(insn);
       else if (insn->Opcode < MAX_WM_OPCODE) {
@@ -988,7 +988,7 @@ static void print_insns( const struct prog_instruction *insn,
 				     3);
       }
       else 
-	 _mesa_printf("965 Opcode %d\n", insn->Opcode);
+	 debug_printf("965 Opcode %d\n", insn->Opcode);
    }
 }
 
@@ -1002,10 +1002,10 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    struct brw_fragment_program *fp = c->fp;
    GLuint insn;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("pre-fp:\n");
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("pre-fp:\n");
       _mesa_print_program(&fp->program.Base); 
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 
    c->pixel_xy = src_undef();
@@ -1103,10 +1103,10 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("pass_fp:\n");
+   if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("pass_fp:\n");
       print_insns( c->prog_instructions, c->nr_fp_insns );
-      _mesa_printf("\n");
+      debug_printf("\n");
    }
 }
 
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index a8de5fdd0b9..3118e615f91 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1694,7 +1694,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
         c->cur_inst = i;
 
 #if 0
-        _mesa_printf("Inst %d: ", i);
+        debug_printf("Inst %d: ", i);
         _mesa_print_instruction(inst);
 #endif
 
@@ -1920,7 +1920,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                }
                break;
 	    default:
-		_mesa_printf("unsupported IR in fragment shader %d\n",
+		debug_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
 
@@ -1931,11 +1931,11 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     }
     post_wm_emit(c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("wm-native:\n");
+    if (BRW_DEBUG & DEBUG_WM) {
+      debug_printf("wm-native:\n");
       for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i]);
-      _mesa_printf("\n");
+      debug_printf("\n");
     }
 }
 
@@ -1945,8 +1945,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
  */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
-    if (INTEL_DEBUG & DEBUG_WM) {
-        _mesa_printf("brw_wm_glsl_emit:\n");
+    if (BRW_DEBUG & DEBUG_WM) {
+        debug_printf("brw_wm_glsl_emit:\n");
     }
 
     /* initial instruction translation/simplification */
@@ -1955,7 +1955,7 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
     /* actual code generation */
     brw_wm_emit_glsl(brw, c);
 
-    if (INTEL_DEBUG & DEBUG_WM) {
+    if (BRW_DEBUG & DEBUG_WM) {
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
index 31b0270e843..71e4c568356 100644
--- a/src/gallium/drivers/i965/brw_wm_pass0.c
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -101,7 +101,7 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
    GLuint i = c->prog_data.nr_params++;
    
    if (i >= BRW_WM_MAX_PARAM) {
-      _mesa_printf("%s: out of params\n", __FUNCTION__);
+      debug_printf("%s: out of params\n", __FUNCTION__);
       c->prog_data.error = 1;
       return NULL;
    }
@@ -150,7 +150,7 @@ static const struct brw_wm_ref *get_imm_ref( struct brw_wm_compile *c,
       return c->imm_ref[i].ref;
    }
    else {
-      _mesa_printf("%s: out of imm_refs\n", __FUNCTION__);
+      debug_printf("%s: out of imm_refs\n", __FUNCTION__);
       c->prog_data.error = 1;
       return NULL;
    }
@@ -434,7 +434,7 @@ void brw_wm_pass0( struct brw_wm_compile *c )
       }
    }
  
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass0");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
index f2ae3a958f9..85a3a55ca4f 100644
--- a/src/gallium/drivers/i965/brw_wm_pass1.c
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -284,7 +284,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       track_arg(c, inst, 2, read2);
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass1");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
index 6faea018fbc..a19ca623287 100644
--- a/src/gallium/drivers/i965/brw_wm_pass2.c
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -331,13 +331,13 @@ void brw_wm_pass2( struct brw_wm_compile *c )
       }
    }
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
       brw_wm_print_program(c, "pass2");
    }
 
    c->state = PASS2_DONE;
 
-   if (INTEL_DEBUG & DEBUG_WM) {
+   if (BRW_DEBUG & DEBUG_WM) {
        brw_wm_print_program(c, "pass2/done");
    }
 }
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
index a8993f93120..32692d533c4 100644
--- a/src/gallium/drivers/i965/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -76,8 +76,9 @@ static GLint S_FIXED(GLfloat value, GLuint frac_bits)
 }
 
 
-static struct brw_winsys_buffer *upload_default_color( struct brw_context *brw,
-				     const GLfloat *color )
+static struct brw_winsys_buffer *
+upload_default_color( struct brw_context *brw,
+		      const GLfloat *color )
 {
    struct brw_sampler_default_color sdc;
 
@@ -117,63 +118,6 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
 {
    _mesa_memset(sampler, 0, sizeof(*sampler));
 
-   switch (key->minfilter) {
-   case GL_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   case GL_NEAREST_MIPMAP_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_LINEAR_MIPMAP_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case GL_NEAREST_MIPMAP_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   case GL_LINEAR_MIPMAP_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   default:
-      break;
-   }
-
-   /* Set Anisotropy: 
-    */
-   if (key->max_aniso > 1.0) {
-      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; 
-      sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
-
-      if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
-				       BRW_ANISORATIO_16);
-      }
-   }
-   else {
-      switch (key->magfilter) {
-      case GL_NEAREST:
-	 sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
-	 break;
-      case GL_LINEAR:
-	 sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-	 break;
-      default:
-	 break;
-      }  
-   }
-
-   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
-   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
-
    /* Cube-maps on 965 and later must use the same wrap mode for all 3
     * coordinate dimensions.  Futher, only CUBE and CLAMP are valid.
     */
@@ -198,36 +142,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
    }
 
 
-   /* Set shadow function: 
-    */
-   if (key->comparemode == GL_COMPARE_R_TO_TEXTURE_ARB) {
-      /* Shadowing is "enabled" by emitting a particular sampler
-       * message (sample_c).  So need to recompile WM program when
-       * shadow comparison is enabled on each/any texture unit.
-       */
-      sampler->ss0.shadow_function =
-	 intel_translate_shadow_compare_func(key->comparefunc);
-   }
-
-   /* Set LOD bias: 
-    */
-   sampler->ss0.lod_bias = S_FIXED(CLAMP(key->lod_bias, -16, 15), 6);
-
-   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
-   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
-
-   /* Set BaseMipLevel, MaxLOD, MinLOD: 
-    *
-    * XXX: I don't think that using firstLevel, lastLevel works,
-    * because we always setup the surface state as if firstLevel ==
-    * level zero.  Probably have to subtract firstLevel from each of
-    * these:
-    */
-   sampler->ss0.base_level = U_FIXED(0, 1);
 
-   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(key->maxlod, 0), 13), 6);
-   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(key->minlod, 0), 13), 6);
-   
    sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
 }
 
@@ -237,57 +152,42 @@ static void
 brw_wm_sampler_populate_key(struct brw_context *brw,
 			    struct wm_sampler_key *key)
 {
-   int unit;
+   int nr = MIN2(brw->curr.number_textures,
+		 brw->curr.number_samplers);
+   int i;
 
    memset(key, 0, sizeof(*key));
 
-   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
-      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
-	 struct wm_sampler_entry *entry = &key->sampler[unit];
-	 struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	 struct gl_texture_object *texObj = texUnit->_Current;
-	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
-	 struct gl_texture_image *firstImage =
-	    texObj->Image[0][intelObj->firstLevel];
-
-         entry->tex_target = texObj->Target;
-
-	 entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
-	    ? ctx->Texture.CubeMapSeamless : GL_FALSE;
-
-	 entry->wrap_r = texObj->WrapR;
-	 entry->wrap_s = texObj->WrapS;
-	 entry->wrap_t = texObj->WrapT;
-
-	 entry->maxlod = texObj->MaxLod;
-	 entry->minlod = texObj->MinLod;
-	 entry->lod_bias = texUnit->LodBias + texObj->LodBias;
-	 entry->max_aniso = texObj->MaxAnisotropy;
-	 entry->minfilter = texObj->MinFilter;
-	 entry->magfilter = texObj->MagFilter;
-	 entry->comparemode = texObj->CompareMode;
-         entry->comparefunc = texObj->CompareFunc;
-
-	 brw->sws->bo_unreference(brw->wm.sdc_bo[unit]);
-	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
-	    float bordercolor[4] = {
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0]
-	    };
-	    /* GL specs that border color for depth textures is taken from the
-	     * R channel, while the hardware uses A.  Spam R into all the
-	     * channels for safety.
-	     */
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
-	 } else {
-	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
-							texObj->BorderColor);
-	 }
-	 key->sampler_count = unit + 1;
+   for (i = 0; i < nr; i++) {
+      const struct brw_texture *tex = brw->curr.texture[i];
+      const struct brw_sampler *sampler = brw->curr.sampler[i];
+      struct wm_sampler_entry *entry = &key->sampler[i];
+
+      entry->tex_target = texObj->Target;
+      entry->seamless_cube_map = FALSE; /* XXX: add this to gallium */
+      entry->ss0 = sampler->ss0;
+      entry->ss1 = sampler->ss1;
+      entry->ss3 = sampler->ss3;
+
+      brw->sws->bo_unreference(brw->wm.sdc_bo[i]);
+      if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+	 float bordercolor[4] = {
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0],
+	    texObj->BorderColor[0]
+	 };
+	 /* GL specs that border color for depth textures is taken from the
+	  * R channel, while the hardware uses A.  Spam R into all the
+	  * channels for safety.
+	  */
+	 brw->wm.sdc_bo[i] = upload_default_color(brw, bordercolor);
+      } else {
+	 brw->wm.sdc_bo[i] = upload_default_color(brw, texObj->BorderColor);
       }
    }
+
+   key->sampler_count = nr;
 }
 
 /* All samplers must be uploaded in a single contiguous array, which
@@ -354,7 +254,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 
 const struct brw_tracked_state brw_wm_samplers = {
    .dirty = {
-      .mesa = _NEW_TEXTURE,
+      .mesa = PIPE_NEW_BOUND_TEXTURES | PIPE_NEW_SAMPLER,
       .brw = 0,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
index 4989aae830c..edabf6ceb6a 100644
--- a/src/gallium/drivers/i965/brw_wm_state.c
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -65,7 +65,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
       key->max_threads = 1;
    else {
       /* WM maximum threads is number of EUs times number of threads per EU. */
@@ -120,7 +120,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
    ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
    /* _NEW_QUERY */
-   key->stats_wm = intel->stats_wm;
+   key->stats_wm = (brw->query.stats_wm != 0);
 
    /* _NEW_LINE */
    key->line_stipple = ctx->Line.StippleFlag;
@@ -215,7 +215,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    wm.wm5.line_stipple = key->line_stipple;
 
-   if (INTEL_DEBUG & DEBUG_STATS || key->stats_wm)
+   if (BRW_DEBUG & DEBUG_STATS || key->stats_wm)
       wm.wm4.stats_enable = 1;
 
    bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
-- 
cgit v1.2.3


From 5a304995e09d8dbfd40a2dfab32eacb7e85798e3 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 26 Oct 2009 01:11:36 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_context.h          |  15 +-
 src/gallium/drivers/i965/brw_gs.c               |   2 +-
 src/gallium/drivers/i965/brw_swtnl.c            | 144 ++---
 src/gallium/drivers/i965/brw_urb.c              |   5 +-
 src/gallium/drivers/i965/brw_vs.c               |  31 +-
 src/gallium/drivers/i965/brw_vs.h               |  14 +-
 src/gallium/drivers/i965/brw_vs_emit.c          | 733 ++++++++++++------------
 src/gallium/drivers/i965/brw_wm_glsl.c          |   4 +-
 src/gallium/drivers/i965/brw_wm_surface_state.c |   7 +-
 9 files changed, 485 insertions(+), 470 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 10c1cf6f33a..8aaf895d20a 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -154,6 +154,7 @@ struct brw_vertex_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
+   unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
@@ -165,6 +166,7 @@ struct brw_fragment_shader {
 
    GLboolean isGLSL;
 
+   unsigned id;
    struct brw_winsys_buffer *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
 };
@@ -280,10 +282,13 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint nr_outputs_written;
-   GLuint nr_params;       /**< number of float params/constants */
 
-   GLuint inputs_read;
+   GLuint nr_outputs;
+   GLuint nr_inputs;
+
+   GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
+
+   GLboolean copy_edgeflag;
 
    /* Used for calculating urb partitions:
     */
@@ -475,8 +480,8 @@ struct brw_context
    /* Active state from the state tracker: 
     */
    struct {
-      const struct brw_vertex_shader *vertex_shader;
-      const struct brw_fragment_shader *fragment_shader;
+      struct brw_vertex_shader *vertex_shader;
+      struct brw_fragment_shader *fragment_shader;
       const struct brw_blend_state *blend;
       const struct brw_rasterizer_state *rast;
       const struct brw_depth_stencil_state *zstencil;
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
index 3ecaa74e4f9..693d8bfdf8d 100644
--- a/src/gallium/drivers/i965/brw_gs.c
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -148,7 +148,7 @@ static void populate_key( struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
-   key->nr_attrs = brw->vs.prog_data->nr_outputs_written;
+   key->nr_attrs = brw->vs.prog_data->nr_outputs;
 
    /* BRW_NEW_PRIMITIVE */
    key->primitive = gs_prim[brw->primitive];
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
index d2df8af9f40..464013e7c40 100644
--- a/src/gallium/drivers/i965/brw_swtnl.c
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -1,111 +1,93 @@
 
-/* XXX: could split the primitive list to fallback only on the
- * non-conformant primitives.
- */
-static GLboolean check_fallbacks( struct brw_context *brw,
-				  const struct _mesa_prim *prim,
-				  GLuint nr_prims )
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+
+
+static GLboolean need_swtnl( struct brw_context *brw )
 {
-   GLuint i;
+   const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
 
    /* If we don't require strict OpenGL conformance, never 
     * use fallbacks.  If we're forcing fallbacks, always
     * use fallfacks.
     */
    if (brw->flags.no_swtnl)
-      return GL_FALSE;
+      return FALSE;
 
    if (brw->flags.force_swtnl)
-      return GL_TRUE;
+      return TRUE;
 
-   if (brw->curr.rast->tmpl.smooth_polys) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
-	    return GL_TRUE;
+   /* Exceeding hw limits on number of VS inputs?
+    */
+   if (brw->curr.num_vertex_elements == 0 ||
+       brw->curr.num_vertex_elements >= BRW_VEP_MAX) {
+      return TRUE;
    }
 
-   /* BRW hardware will do AA lines, but they are non-conformant it
-    * seems.  TBD whether we keep this fallback:
+   /* Position array with zero stride?
+    *
+    * XXX: position isn't always at zero...
+    * XXX: eliminate zero-stride arrays
     */
-   if (ctx->Line.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (reduced_prim[prim[i].mode] == GL_LINES) 
-	    return GL_TRUE;
+   {
+      int ve0_vb = brw->curr.vertex_element[0].vertex_buffer_index;
+      
+      if (brw->curr.vertex_buffer[ve0_vb].stride == 0)
+	 return TRUE;
    }
 
-   /* Stipple -- these fallbacks could be resolved with a little
-    * bit of work?
+   /* XXX: short-circuit
     */
-   if (ctx->Line.StippleFlag) {
-      for (i = 0; i < nr_prims; i++) {
-	 /* GS doesn't get enough information to know when to reset
-	  * the stipple counter?!?
-	  */
-	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
-	    return GL_TRUE;
-	    
-	 if (prim[i].mode == GL_POLYGON &&
-	     (ctx->Polygon.FrontMode == GL_LINE ||
-	      ctx->Polygon.BackMode == GL_LINE))
-	    return GL_TRUE;
-      }
-   }
+   return FALSE;
 
-   if (ctx->Point.SmoothFlag) {
-      for (i = 0; i < nr_prims; i++)
-	 if (prim[i].mode == GL_POINTS) 
-	    return GL_TRUE;
-   }
+   if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
+      if (rast->poly_smooth)
+	 return TRUE;
 
-   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
-    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
-    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
-    * we want strict conformance, force the fallback.
-    * Right now, we only do this for 2D textures.
-    */
+   }
+   
+   if (brw->reduced_primitive == PIPE_PRIM_LINES ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_LINE ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_LINE)))
    {
-      int u;
-      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
-         if (texUnit->Enabled) {
-            if (texUnit->Enabled & TEXTURE_1D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_2D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-            if (texUnit->Enabled & TEXTURE_3D_BIT) {
-               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
-                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
-                   return GL_TRUE;
-               }
-            }
-         }
-      }
+      /* BRW hardware will do AA lines, but they are non-conformant it
+       * seems.  TBD whether we keep this fallback:
+       */
+      if (rast->line_smooth)
+	 return TRUE;
+
+      /* XXX: was a fallback in mesa (gs doesn't get enough
+       * information to know when to reset stipple counter), but there
+       * must be a way around it.
+       */
+      if (rast->line_stipple_enable &&
+	  (brw->reduced_primitive == PIPE_PRIM_TRIANGLES ||
+	   brw->primitive == PIPE_PRIM_LINE_LOOP || 
+	   brw->primitive == PIPE_PRIM_LINE_STRIP))
+	 return TRUE;
    }
 
-   /* Exceeding hw limits on number of VS inputs?
-    */
-   if (brw->nr_ve == 0 ||
-       brw->nr_ve >= BRW_VEP_MAX) {
-      return TRUE;
+   
+   if (brw->reduced_primitive == PIPE_PRIM_POINTS ||
+       (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+	(rast->fill_cw == PIPE_POLYGON_MODE_POINT ||
+	 rast->fill_ccw == PIPE_POLYGON_MODE_POINT)))
+   {
+      if (rast->point_smooth)
+	 return TRUE;
    }
 
-   /* Position array with zero stride?
+   /* BRW hardware doesn't handle CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats CLAMP
+    * as CLAMP_TO_EDGE instead.  If we're using CLAMP, and
+    * we want strict conformance, force the fallback.
+    *
+    * XXX: need a workaround for this.
     */
-   if (brw->vs[brw->ve[0]]->stride == 0)
-      return TRUE;
-
-
       
    /* Nothing stopping us from the fast path now */
-   return GL_FALSE;
+   return FALSE;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
index ff2466528df..57fd8f20b20 100644
--- a/src/gallium/drivers/i965/brw_urb.c
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -35,6 +35,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_debug.h"
 
 #define VS 0
 #define GS 1
@@ -111,7 +112,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
 /* Most minimal update, forces re-emit of URB fence packet after GS
  * unit turned on/off.
  */
-static void recalculate_urb_fence( struct brw_context *brw )
+static int recalculate_urb_fence( struct brw_context *brw )
 {
    GLuint csize = brw->curbe.total_size;
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
@@ -204,6 +205,8 @@ done:
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 010ac115d36..3965ca6c54e 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -28,17 +28,19 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-           
+
+#include "tgsi/tgsi_dump.h"           
 
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_util.h"
 #include "brw_state.h"
+#include "brw_pipe_rast.h"
 
 
 static void do_vs_prog( struct brw_context *brw, 
-			struct brw_vertex_program *vp,
+			struct brw_vertex_shader *vp,
 			struct brw_vs_prog_key *key )
 {
    GLuint program_size;
@@ -51,16 +53,12 @@ static void do_vs_prog( struct brw_context *brw,
    brw_init_compile(brw, &c.func);
    c.vp = vp;
 
-   c.prog_data.nr_outputs_written = vp->program.Base.OutputsWritten;
-   c.prog_data.inputs_read = vp->program.Base.InputsRead;
-
-   if (c.key.copy_edgeflag) {
-      c.prog_data.nr_outputs_written |= 1<<VERT_RESULT_EDGE;
-      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
-   }
+   c.prog_data.nr_outputs = vp->info.num_outputs;
+   c.prog_data.nr_inputs = vp->info.num_inputs;
+   c.prog_data.copy_edgeflag = c.key.copy_edgeflag;
 
    if (0)
-      tgsi_dump(&c.vp->tokens, 0);
+      tgsi_dump(c.vp->tokens, 0);
 
    /* Emit GEN4 code.
     */
@@ -80,11 +78,10 @@ static void do_vs_prog( struct brw_context *brw,
 }
 
 
-static void brw_upload_vs_prog(struct brw_context *brw)
+static int brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
-   struct brw_vertex_program *vp = 
-      (struct brw_vertex_program *)brw->vertex_program;
+   struct brw_vertex_shader *vp = brw->curr.vertex_shader;
 
    memset(&key, 0, sizeof(key));
 
@@ -92,9 +89,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw->nr_userclip;
-   key.copy_edgeflag = (brw->rast->fill_ccw != PIPE_POLYGON_MODE_FILL ||
-			brw->rast->fill_cw != PIPE_POLYGON_MODE_FILL);
+   key.nr_userclip = brw->curr.ucp.nr;
+   key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
+			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
    /* Make an early check for the key.
     */
@@ -105,6 +102,8 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 				      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
+
+   return 0;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index e33fa2f0aad..58119567dcf 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -46,17 +46,22 @@ struct brw_vs_prog_key {
 };
 
 
+
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+
 struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
 
-   struct brw_vertex_program *vp;
+   struct brw_vertex_shader *vp;
 
    GLuint nr_inputs;
+   GLuint nr_outputs;
+   GLboolean copy_edgeflag;
 
    GLuint first_output;
-   GLuint nr_outputs;
    GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
@@ -80,8 +85,13 @@ struct brw_vs_compile {
       GLint index;
       struct brw_reg reg;
    } current_const[3];
+
+   struct brw_instruction *if_inst[MAX_IF_DEPTH];
+   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+
 };
 
+
 void brw_vs_emit( struct brw_vs_compile *c );
 
 #endif
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 04132a167bc..4daa98b29e4 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -28,11 +28,25 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-            
 
 #include "pipe/p_shader_tokens.h"
+            
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "tgsi/tgsi_ureg.h"
+
 #include "brw_context.h"
 #include "brw_vs.h"
+#include "brw_debug.h"
+
+
+struct ureg_instruction {
+   unsigned opcode:8;
+   unsigned tex_target:3;
+   struct ureg_dst dst;
+   struct ureg_src src[3];
+};
 
 
 static struct brw_reg get_tmp( struct brw_vs_compile *c )
@@ -72,8 +86,8 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * works if everything fits in the GRF.
     * XXX this heuristic/check may need some fine tuning...
     */
-   if (c->vp->program.Base.Parameters->NumParameters +
-       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+   if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
+       c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
       c->vp->use_const_buffer = GL_TRUE;
    else
       c->vp->use_const_buffer = GL_FALSE;
@@ -106,25 +120,21 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    }
    else {
       /* use a section of the GRF for constants */
-      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+      GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
       for (i = 0; i < nr_params; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+         c->regs[TGSI_FILE_CONSTANT][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
       }
       reg += (nr_params + 1) / 2;
       c->prog_data.curb_read_length = reg - 1;
-
       c->prog_data.nr_params = nr_params * 4;
    }
 
    /* Allocate input regs:  
     */
-   c->nr_inputs = 0;
-   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (c->prog_data.inputs_read & (1 << i)) {
-	 c->nr_inputs++;
-	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
-      }
+   c->nr_inputs = c->vp->info.num_inputs;
+   for (i = 0; i < c->nr_inputs; i++) {
+      c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
+      reg++;
    }
 
    /* If there are no inputs, we'll still be reading one attribute's worth
@@ -144,45 +154,51 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    else
       mrf = 4;
 
-   for (i = 0; i < c->prog_data.nr_outputs_written; i++) {
-      c->nr_outputs++;
-      assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
-      if (i == VERT_RESULT_HPOS) {
-	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+   /* XXX: need to access vertex output semantics here:
+    */
+   c->nr_outputs = c->prog_data.nr_outputs;
+   for (i = 0; i < c->prog_data.nr_outputs; i++) {
+      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));
+
+      /* XXX: Hardwire position to zero:
+       */
+      if (i == 0) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
       }
-      else if (i == VERT_RESULT_PSIZ) {
-	 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+      /* XXX: disable psiz:
+       */
+      else if (0) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
 	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
       }
+      else if (mrf < 16) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
+	 mrf++;
+      }
       else {
-	 if (mrf < 16) {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
-	 }
-	 else {
-	    /* too many vertex results to fit in MRF, use GRF for overflow */
-	    if (!c->first_overflow_output)
-	       c->first_overflow_output = i;
-	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	    reg++;
-	 }
+	 /* too many vertex results to fit in MRF, use GRF for overflow */
+	 if (!c->first_overflow_output)
+	    c->first_overflow_output = i;
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+	 reg++;
       }
    }     
 
    /* Allocate program temporaries:
     */
-   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
-      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
+   
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
+      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
       reg++;
    }
 
    /* Address reg(s).  Don't try to use the internal address reg until
     * deref time.
     */
-   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
-      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
+   for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
+      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
 					     reg,
 					     0,
 					     BRW_REGISTER_TYPE_D,
@@ -243,8 +259,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->prog_data.total_grf = reg;
 
    if (BRW_DEBUG & DEBUG_VS) {
-      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
-      debug_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      debug_printf("%s NumAddrRegs %d\n", __FUNCTION__, 
+		   c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
+      debug_printf("%s NumTemps %d\n", __FUNCTION__,
+		   c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
       debug_printf("%s reg = %d\n", __FUNCTION__, reg);
    }
 }
@@ -740,25 +758,25 @@ static void emit_nrm( struct brw_vs_compile *c,
 
 static struct brw_reg
 get_constant(struct brw_vs_compile *c,
-             const struct prog_instruction *inst,
+             const struct ureg_instruction *inst,
              GLuint argIndex)
 {
-   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   const struct ureg_src src = inst->src[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
-   const GLboolean relAddr = src->RelAddr;
+   const GLboolean relAddr = src.Indirect;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src->Index || relAddr) {
-      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+   if (c->current_const[argIndex].index != src.Index || relAddr) {
+      struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
 
-      c->current_const[argIndex].index = src->Index;
+      c->current_const[argIndex].index = src.Index;
 
 #if 0
       printf("  fetch const[%d] for arg %d into reg %d\n",
-             src->Index, argIndex, c->current_const[argIndex].reg.nr);
+             src.Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
       /* need to fetch the constant now */
       brw_dp_READ_4_vs(p,
@@ -766,7 +784,7 @@ get_constant(struct brw_vs_compile *c,
                        0,                             /* oword */
                        relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
-                       16 * src->Index,               /* byte offset */
+                       16 * src.Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
@@ -783,7 +801,7 @@ get_constant(struct brw_vs_compile *c,
                           1,                       /* oword */
                           relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
-                          16 * src->Index,         /* byte offset */
+                          16 * src.Index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
                           );
       }
@@ -813,30 +831,24 @@ get_constant(struct brw_vs_compile *c,
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       gl_register_file file,
+			       enum tgsi_file_type file,
 			       GLuint index )
 {
    switch (file) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_INPUT:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
+   case TGSI_FILE_CONSTANT:
       assert(c->regs[file][index].nr != 0);
       return c->regs[file][index];
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_CONSTANT:
-   case PROGRAM_UNIFORM:
-      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-      return c->regs[PROGRAM_STATE_VAR][index];
-   case PROGRAM_ADDRESS:
+
+   case TGSI_FILE_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
 
-   case PROGRAM_UNDEFINED:			/* undef values */
+   case TGSI_FILE_NULL:			/* undef values */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
-   case PROGRAM_ENV_PARAM: 
-   case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
       return brw_null_reg();
@@ -853,7 +865,7 @@ static struct brw_reg deref( struct brw_vs_compile *c,
 {
    struct brw_compile *p = &c->func;
    struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
    struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
@@ -886,17 +898,17 @@ static struct brw_reg deref( struct brw_vs_compile *c,
  */
 static struct brw_reg
 get_src_reg( struct brw_vs_compile *c,
-             const struct prog_instruction *inst,
+             const struct ureg_instruction *inst,
              GLuint argIndex )
 {
-   const GLuint file = inst->SrcReg[argIndex].File;
-   const GLint index = inst->SrcReg[argIndex].Index;
-   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+   const GLuint file = inst->src[argIndex].File;
+   const GLint index = inst->src[argIndex].Index;
+   const GLboolean relAddr = inst->src[argIndex].Indirect;
 
    switch (file) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_INPUT:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_OUTPUT:
       if (relAddr) {
          return deref(c, c->regs[file][0], index);
       }
@@ -905,30 +917,25 @@ get_src_reg( struct brw_vs_compile *c,
          return c->regs[file][index];
       }
 
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_CONSTANT:
-   case PROGRAM_UNIFORM:
-   case PROGRAM_ENV_PARAM:
+   case TGSI_FILE_CONSTANT:
       if (c->vp->use_const_buffer) {
          return get_constant(c, inst, argIndex);
       }
       else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+         return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
       }
       else {
-         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-         return c->regs[PROGRAM_STATE_VAR][index];
+         assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
+         return c->regs[TGSI_FILE_CONSTANT][index];
       }
-   case PROGRAM_ADDRESS:
+   case TGSI_FILE_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
 
-   case PROGRAM_UNDEFINED:
+   case TGSI_FILE_NULL:
       /* this is a normal case since we loop over all three src args */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
-   case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
       return brw_null_reg();
@@ -959,27 +966,27 @@ static void emit_arl( struct brw_vs_compile *c,
  * Return the brw reg for the given instruction's src argument.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-                               const struct prog_instruction *inst,
+                               const struct ureg_instruction *inst,
                                GLuint argIndex )
 {
-   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   const struct ureg_src src = inst->src[argIndex];
    struct brw_reg reg;
 
-   if (src->File == PROGRAM_UNDEFINED)
+   if (src.File == TGSI_FILE_NULL)
       return brw_null_reg();
 
    reg = get_src_reg(c, inst, argIndex);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
-				       GET_SWZ(src->Swizzle, 1),
-				       GET_SWZ(src->Swizzle, 2),
-				       GET_SWZ(src->Swizzle, 3));
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src.SwizzleX,
+				       src.SwizzleY,
+				       src.SwizzleZ,
+				       src.SwizzleW);
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src->Negate ? 1 : 0;   
+   reg.negate = src.Negate ? 1 : 0;   
 
    return reg;
 }
@@ -989,21 +996,21 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
  * Get brw register for the given program dest register.
  */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
-			       struct prog_dst_register dst )
+			       struct ureg_dst dst )
 {
    struct brw_reg reg;
 
    switch (dst.File) {
-   case PROGRAM_TEMPORARY:
-   case PROGRAM_OUTPUT:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_OUTPUT:
       assert(c->regs[dst.File][dst.Index].nr != 0);
       reg = c->regs[dst.File][dst.Index];
       break;
-   case PROGRAM_ADDRESS:
+   case TGSI_FILE_ADDRESS:
       assert(dst.Index == 0);
       reg = c->regs[dst.File][dst.Index];
       break;
-   case PROGRAM_UNDEFINED:
+   case TGSI_FILE_NULL:
       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
       reg = brw_null_reg();
       break;
@@ -1027,15 +1034,16 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
    struct brw_reg m0 = brw_message_reg(0);
-   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
+   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
    GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
+      assert(0);
       brw_MOV(p, 
-	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
-	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
+	      get_reg(c, TGSI_FILE_OUTPUT, 0),
+	      get_reg(c, TGSI_FILE_INPUT, 0));
    }
 
    /* Build ndc coords */
@@ -1060,7 +1068,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
       if (c->prog_data.writes_psiz) {
-	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
+	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
       }
@@ -1138,7 +1146,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
-!
+
    if (c->first_overflow_output > 0) {
       /* Not all of the vertex outputs/results fit into the MRF.
        * Move the overflowed attributes from the GRF to the MRF and
@@ -1148,9 +1156,9 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * at mrf[4] atm...
        */
       GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs_written; i++) {
+      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
 	 /* move from GRF to MRF */
-	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
 	 mrf++;
       }
 
@@ -1195,9 +1203,9 @@ post_vs_emit( struct brw_vs_compile *c,
 }
 
 static uint32_t
-get_predicate(const struct prog_instruction *inst)
+get_predicate(const struct ureg_instruction *inst)
 {
-   if (inst->DstReg.CondMask == COND_TR)
+   if (inst->dst.CondMask == COND_TR)
       return BRW_PREDICATE_NONE;
 
    /* All of GLSL only produces predicates for COND_NE and one channel per
@@ -1213,9 +1221,9 @@ get_predicate(const struct prog_instruction *inst)
     * predicate on that.  We can probably support this, but it won't
     * necessarily be easy.
     */
-   assert(inst->DstReg.CondMask == COND_NE);
+/*   assert(inst->dst.CondMask == COND_NE); */
 
-   switch (inst->DstReg.CondSwizzle) {
+   switch (inst->dst.CondSwizzle) {
    case SWIZZLE_XXXX:
       return BRW_PREDICATE_ALIGN16_REPLICATE_X;
    case SWIZZLE_YYYY:
@@ -1225,26 +1233,281 @@ get_predicate(const struct prog_instruction *inst)
    case SWIZZLE_WWWW:
       return BRW_PREDICATE_ALIGN16_REPLICATE_W;
    default:
-      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
-		    inst->DstReg.CondMask);
+      debug_printf("Unexpected predicate: 0x%08x\n",
+		    inst->dst.CondMask);
       return BRW_PREDICATE_NORMAL;
    }
 }
 
+static void emit_insn(struct brw_vs_compile *c,
+		      const struct tgsi_full_instruction *insn)
+{
+   struct brw_reg args[3], dst;
+   GLuint i;
+
+#if 0
+   printf("%d: ", insn);
+   _mesa_print_instruction(inst);
+#endif
+
+   /* Get argument regs.
+    */
+   for (i = 0; i < 3; i++) {
+      const struct ureg_src src = inst->src[i];
+      index = src.Index;
+      file = src.File;	
+      args[i] = get_arg(c, inst, i);
+   }
+
+   /* Get dest regs.  Note that it is possible for a reg to be both
+    * dst and arg, given the static allocation of registers.  So
+    * care needs to be taken emitting multi-operation instructions.
+    */ 
+   index = inst->dst.Index;
+   file = inst->dst.File;
+   dst = get_dst(c, inst->dst);
+
+   if (inst->SaturateMode != SATURATE_OFF) {
+      debug_printf("Unsupported saturate %d in vertex shader",
+		   inst->SaturateMode);
+   }
+
+   switch (inst->Opcode) {
+   case TGSI_OPCODE_ABS:
+      brw_MOV(p, dst, brw_abs(args[0]));
+      break;
+   case TGSI_OPCODE_ADD:
+      brw_ADD(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_COS:
+      emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_DP3:
+      brw_DP3(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DP4:
+      brw_DP4(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_DPH:
+      brw_DPH(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_NRM3:
+      emit_nrm(c, dst, args[0], 3);
+      break;
+   case TGSI_OPCODE_NRM4:
+      emit_nrm(c, dst, args[0], 4);
+      break;
+   case TGSI_OPCODE_DST:
+      unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
+      break;
+   case TGSI_OPCODE_EXP:
+      unalias1(c, dst, args[0], emit_exp_noalias);
+      break;
+   case TGSI_OPCODE_EX2:
+      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_ARL:
+      emit_arl(c, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FLR:
+      brw_RNDD(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_FRC:
+      brw_FRC(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_LOG:
+      unalias1(c, dst, args[0], emit_log_noalias);
+      break;
+   case TGSI_OPCODE_LG2:
+      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_LIT:
+      unalias1(c, dst, args[0], emit_lit_noalias);
+      break;
+   case TGSI_OPCODE_LRP:
+      unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
+      break;
+   case TGSI_OPCODE_MAD:
+      brw_MOV(p, brw_acc_reg(), args[2]);
+      brw_MAC(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MAX:
+      emit_max(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MIN:
+      emit_min(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_MOV:
+      brw_MOV(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_MUL:
+      brw_MUL(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_POW:
+      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
+      break;
+   case TGSI_OPCODE_RCP:
+      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_RSQ:
+      emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SEQ:
+      emit_seq(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SIN:
+      emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
+      break;
+   case TGSI_OPCODE_SNE:
+      emit_sne(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGE:
+      emit_sge(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SGT:
+      emit_sgt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLT:
+      emit_slt(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SLE:
+      emit_sle(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_SUB:
+      brw_ADD(p, dst, args[0], negate(args[1]));
+      break;
+   case TGSI_OPCODE_TRUNC:
+      /* round toward zero */
+      brw_RNDZ(p, dst, args[0]);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(p, dst, args[0], args[1]);
+      break;
+   case TGSI_OPCODE_IF:
+      assert(if_depth < MAX_IF_DEPTH);
+      if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+      /* Note that brw_IF smashes the predicate_control field. */
+      if_inst[if_depth]->header.predicate_control = get_predicate(inst);
+      if_depth++;
+      break;
+   case TGSI_OPCODE_ELSE:
+      if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
+      break;
+   case TGSI_OPCODE_ENDIF:
+      assert(if_depth > 0);
+      brw_ENDIF(p, if_inst[--if_depth]);
+      break;			
+   case TGSI_OPCODE_BGNLOOP:
+      loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+      break;
+   case TGSI_OPCODE_BRK:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_BREAK(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CONT:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_CONT(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_ENDLOOP: 
+   {
+      struct brw_instruction *inst0, *inst1;
+      GLuint br = 1;
+
+      loop_depth--;
+
+      if (BRW_IS_IGDNG(brw))
+	 br = 2;
+
+      inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+      /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+      while (inst0 > loop_inst[loop_depth]) {
+	 inst0--;
+	 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+	 else if (inst0->header.opcode == TGSI_OPCODE_CONT) {
+	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	    inst0->bits3.if_else.pop_count = 0;
+	 }
+      }
+   }
+   break;
+   case TGSI_OPCODE_BRA:
+      brw_set_predicate_control(p, get_predicate(inst));
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      break;
+   case TGSI_OPCODE_CAL:
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, get_addr_reg(stack_index),
+	      get_addr_reg(stack_index), brw_imm_d(4));
+      brw_save_call(p, inst->Comment, p->nr_insn);
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_RET:
+      brw_ADD(p, get_addr_reg(stack_index),
+	      get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_set_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
+      brw_set_access_mode(p, BRW_ALIGN_16);
+      break;
+   case TGSI_OPCODE_END:	
+      end_offset = p->nr_insn;
+      /* this instruction will get patched later to jump past subroutine
+       * code, etc.
+       */
+      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+      break;
+   case TGSI_OPCODE_PRINT:
+      /* no-op */
+      break;
+   case TGSI_OPCODE_BGNSUB:
+      brw_save_label(p, inst->Comment, p->nr_insn);
+      break;
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+   default:
+      debug_printf("Unsupported opcode %i (%s) in vertex shader",
+		   inst->Opcode, inst->Opcode < MAX_OPCODE ?
+		   _mesa_opcode_string(inst->Opcode) :
+		   "unknown");
+   }
+
+   /* Set the predication update on the last instruction of the native
+    * instruction sequence.
+    *
+    * This would be problematic if it was set on a math instruction,
+    * but that shouldn't be the case with the current GLSL compiler.
+    */
+   if (inst->CondUpdate) {
+      struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+      assert(hw_insn->header.destreg__conditionalmod == 0);
+      hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+   }
+
+   release_tmps(c);
+}
+
+
 /* Emit the vertex program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
-#define MAX_IF_DEPTH 32
-#define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
-   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
    GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
    const struct brw_indirect stack_index = brw_indirect(0, 0);   
+   struct tgsi_parse_context parse;
+   struct tgsi_full_declaration *decl;
    GLuint index;
    GLuint file;
 
@@ -1264,258 +1527,8 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
-      struct brw_reg args[3], dst;
-      GLuint i;
+      const struct ureg_instruction *inst = &c->vp->program.Base.Instructions[insn];
       
-#if 0
-      printf("%d: ", insn);
-      _mesa_print_instruction(inst);
-#endif
-
-      /* Get argument regs.
-       */
-      for (i = 0; i < 3; i++) {
-	 const struct prog_src_register *src = &inst->SrcReg[i];
-	 index = src->Index;
-	 file = src->File;	
-	 args[i] = get_arg(c, inst, i);
-      }
-
-      /* Get dest regs.  Note that it is possible for a reg to be both
-       * dst and arg, given the static allocation of registers.  So
-       * care needs to be taken emitting multi-operation instructions.
-       */ 
-      index = inst->DstReg.Index;
-      file = inst->DstReg.File;
-      dst = get_dst(c, inst->DstReg);
-
-      if (inst->SaturateMode != SATURATE_OFF) {
-	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
-                       inst->SaturateMode);
-      }
-
-      switch (inst->Opcode) {
-      case TGSI_OPCODE_ABS:
-	 brw_MOV(p, dst, brw_abs(args[0]));
-	 break;
-      case TGSI_OPCODE_ADD:
-	 brw_ADD(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_COS:
-	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_DP3:
-	 brw_DP3(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_DP4:
-	 brw_DP4(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_DPH:
-	 brw_DPH(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_NRM3:
-	 emit_nrm(c, dst, args[0], 3);
-	 break;
-      case TGSI_OPCODE_NRM4:
-	 emit_nrm(c, dst, args[0], 4);
-	 break;
-      case TGSI_OPCODE_DST:
-	 unalias2(c, dst, args[0], args[1], emit_dst_noalias); 
-	 break;
-      case TGSI_OPCODE_EXP:
-	 unalias1(c, dst, args[0], emit_exp_noalias);
-	 break;
-      case TGSI_OPCODE_EX2:
-	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_ARL:
-	 emit_arl(c, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_FLR:
-	 brw_RNDD(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_FRC:
-	 brw_FRC(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_LOG:
-	 unalias1(c, dst, args[0], emit_log_noalias);
-	 break;
-      case TGSI_OPCODE_LG2:
-	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_LIT:
-	 unalias1(c, dst, args[0], emit_lit_noalias);
-	 break;
-      case TGSI_OPCODE_LRP:
-	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
-	 break;
-      case TGSI_OPCODE_MAD:
-	 brw_MOV(p, brw_acc_reg(), args[2]);
-	 brw_MAC(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MAX:
-	 emit_max(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MIN:
-	 emit_min(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_MOV:
-	 brw_MOV(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_MUL:
-	 brw_MUL(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_POW:
-	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); 
-	 break;
-      case TGSI_OPCODE_RCP:
-	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_RSQ:
-	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_SEQ:
-         emit_seq(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SIN:
-	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
-	 break;
-      case TGSI_OPCODE_SNE:
-         emit_sne(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SGE:
-	 emit_sge(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_SGT:
-         emit_sgt(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SLT:
-	 emit_slt(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_SLE:
-         emit_sle(p, dst, args[0], args[1]);
-         break;
-      case TGSI_OPCODE_SUB:
-	 brw_ADD(p, dst, args[0], negate(args[1]));
-	 break;
-      case TGSI_OPCODE_TRUNC:
-         /* round toward zero */
-	 brw_RNDZ(p, dst, args[0]);
-	 break;
-      case TGSI_OPCODE_XPD:
-	 emit_xpd(p, dst, args[0], args[1]);
-	 break;
-      case TGSI_OPCODE_IF:
-	 assert(if_depth < MAX_IF_DEPTH);
-	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
-	 /* Note that brw_IF smashes the predicate_control field. */
-	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
-	 if_depth++;
-	 break;
-      case TGSI_OPCODE_ELSE:
-	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
-	 break;
-      case TGSI_OPCODE_ENDIF:
-         assert(if_depth > 0);
-	 brw_ENDIF(p, if_inst[--if_depth]);
-	 break;			
-      case TGSI_OPCODE_BGNLOOP:
-         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
-         break;
-      case TGSI_OPCODE_BRK:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_BREAK(p);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_CONT:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_CONT(p);
-         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_ENDLOOP: 
-         {
-            struct brw_instruction *inst0, *inst1;
-	    GLuint br = 1;
-
-            loop_depth--;
-
-	    if (BRW_IS_IGDNG(brw))
-	       br = 2;
-
-            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-            while (inst0 > loop_inst[loop_depth]) {
-               inst0--;
-               if (inst0->header.opcode == BRW_TGSI_OPCODE_BREAK) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-                  inst0->bits3.if_else.pop_count = 0;
-               }
-               else if (inst0->header.opcode == BRW_TGSI_OPCODE_CONTINUE) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-                  inst0->bits3.if_else.pop_count = 0;
-               }
-            }
-         }
-         break;
-      case TGSI_OPCODE_BRA:
-	 brw_set_predicate_control(p, get_predicate(inst));
-         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-         break;
-      case TGSI_OPCODE_CAL:
-	 brw_set_access_mode(p, BRW_ALIGN_1);
-	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-	 brw_set_access_mode(p, BRW_ALIGN_16);
-	 brw_ADD(p, get_addr_reg(stack_index),
-			 get_addr_reg(stack_index), brw_imm_d(4));
-         brw_save_call(p, inst->Comment, p->nr_insn);
-	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-         break;
-      case TGSI_OPCODE_RET:
-	 brw_ADD(p, get_addr_reg(stack_index),
-			 get_addr_reg(stack_index), brw_imm_d(-4));
-	 brw_set_access_mode(p, BRW_ALIGN_1);
-         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
-	 brw_set_access_mode(p, BRW_ALIGN_16);
-	 break;
-      case TGSI_OPCODE_END:	
-         end_offset = p->nr_insn;
-         /* this instruction will get patched later to jump past subroutine
-          * code, etc.
-          */
-         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-         break;
-      case TGSI_OPCODE_PRINT:
-         /* no-op */
-         break;
-      case TGSI_OPCODE_BGNSUB:
-         brw_save_label(p, inst->Comment, p->nr_insn);
-         break;
-      case TGSI_OPCODE_ENDSUB:
-         /* no-op */
-         break;
-      default:
-	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
-                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
-				    _mesa_opcode_string(inst->Opcode) :
-				    "unknown");
-      }
-
-      /* Set the predication update on the last instruction of the native
-       * instruction sequence.
-       *
-       * This would be problematic if it was set on a math instruction,
-       * but that shouldn't be the case with the current GLSL compiler.
-       */
-      if (inst->CondUpdate) {
-	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
-
-	 assert(hw_insn->header.destreg__conditionalmod == 0);
-	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
-      }
-
-      release_tmps(c);
    }
 
    end_inst = &p->store[end_offset];
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 3118e615f91..23f7ba16fd0 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -167,8 +167,8 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	case PROGRAM_PAYLOAD:
 	    break;
 	default:
-	    _mesa_problem(NULL, "Unexpected file in get_reg()");
-	    return brw_null_reg();
+	   debug_printf("Unexpected file in get_reg()");
+	   return brw_null_reg();
     }
 
     assert(index < 256);
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
index e1ed6438dca..7157feb6f39 100644
--- a/src/gallium/drivers/i965/brw_wm_surface_state.c
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -516,8 +516,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 	 break;
       default:
-	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
-		       irb->texformat->MesaFormat);
+	 debug_printf("Bad renderbuffer format: %d\n",
+		      irb->texformat->MesaFormat);
+	 assert(0);
+	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+	 return;
       }
       key.tiling = region->tiling;
       if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
-- 
cgit v1.2.3


From 7ba2fe40fa092551f1c493d754c80ca93564d32b Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 27 Oct 2009 00:29:21 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_context.h |   1 +
 src/gallium/drivers/i965/brw_eu.c      |  18 ++---
 src/gallium/drivers/i965/brw_eu.h      |   4 +-
 src/gallium/drivers/i965/brw_vs.h      |   6 ++
 src/gallium/drivers/i965/brw_vs_emit.c | 131 ++++++++++++++++-----------------
 src/gallium/drivers/i965/brw_wm.h      |   9 +--
 src/gallium/drivers/i965/brw_wm_glsl.c |   2 +-
 7 files changed, 83 insertions(+), 88 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 8aaf895d20a..7b85363e9f0 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -289,6 +289,7 @@ struct brw_vs_prog_data {
    GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
 
    GLboolean copy_edgeflag;
+   GLboolean writes_psiz;
 
    /* Used for calculating urb partitions:
     */
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
index df49d4b72f7..1189a35b6f2 100644
--- a/src/gallium/drivers/i965/brw_eu.c
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -152,7 +152,7 @@ const GLuint *brw_get_program( struct brw_compile *p,
  */
 struct brw_glsl_label
 {
-   const char *name; /**< the label string */
+   GLuint label;     /**< the label number */
    GLuint position;  /**< the position of the brw instruction for this label */
    struct brw_glsl_label *next;  /**< next in linked list */
 };
@@ -164,7 +164,7 @@ struct brw_glsl_label
 struct brw_glsl_call
 {
    GLuint call_inst_pos;  /**< location of the CAL instruction */
-   const char *sub_name;  /**< name of subroutine to call */
+   GLuint label;
    struct brw_glsl_call *next;  /**< next in linked list */
 };
 
@@ -173,10 +173,10 @@ struct brw_glsl_call
  * Called for each OPCODE_BGNSUB.
  */
 void
-brw_save_label(struct brw_compile *c, const char *name, GLuint position)
+brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
 {
    struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
-   label->name = name;
+   label->label = l;
    label->position = position;
    label->next = c->first_label;
    c->first_label = label;
@@ -187,11 +187,11 @@ brw_save_label(struct brw_compile *c, const char *name, GLuint position)
  * Called for each OPCODE_CAL.
  */
 void
-brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
+brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
 {
    struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
    call->call_inst_pos = call_pos;
-   call->sub_name = name;
+   call->label = label;
    call->next = c->first_call;
    c->first_call = call;
 }
@@ -201,11 +201,11 @@ brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
  * Lookup a label, return label's position/offset.
  */
 static GLuint
-brw_lookup_label(struct brw_compile *c, const char *name)
+brw_lookup_label(struct brw_compile *c, unsigned l)
 {
    const struct brw_glsl_label *label;
    for (label = c->first_label; label; label = label->next) {
-      if (strcmp(name, label->name) == 0) {
+      if (l == label->label) {
          return label->position;
       }
    }
@@ -224,7 +224,7 @@ brw_resolve_cals(struct brw_compile *c)
     const struct brw_glsl_call *call;
 
     for (call = c->first_call; call; call = call->next) {
-        const GLuint sub_loc = brw_lookup_label(c, call->sub_name);
+        const GLuint sub_loc = brw_lookup_label(c, call->label);
 	struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
 	struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
 	GLint offset = brw_sub_inst - brw_call_inst;
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
index ac5a623cac6..3379522104a 100644
--- a/src/gallium/drivers/i965/brw_eu.h
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -136,10 +136,10 @@ struct brw_compile {
 
 
 void
-brw_save_label(struct brw_compile *c, const char *name, GLuint position);
+brw_save_label(struct brw_compile *c, unsigned label, GLuint position);
 
 void
-brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos);
+brw_save_call(struct brw_compile *c, unsigned label, GLuint call_pos);
 
 void
 brw_resolve_cals(struct brw_compile *c);
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 58119567dcf..2a2dbb3457c 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -54,6 +54,7 @@ struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
+   struct brw_chipset chipset;
 
    struct brw_vertex_shader *vp;
 
@@ -88,7 +89,12 @@ struct brw_vs_compile {
 
    struct brw_instruction *if_inst[MAX_IF_DEPTH];
    struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+   GLuint insn;
+   GLuint if_depth;
+   GLuint loop_depth;
+   GLuint end_offset;
 
+   struct brw_indirect stack_index;
 };
 
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 4daa98b29e4..5366ab85140 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -35,19 +35,15 @@
 #include "util/u_math.h"
 
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_ureg_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
 
 #include "brw_context.h"
 #include "brw_vs.h"
 #include "brw_debug.h"
 
 
-struct ureg_instruction {
-   unsigned opcode:8;
-   unsigned tex_target:3;
-   struct ureg_dst dst;
-   struct ureg_src src[3];
-};
-
 
 static struct brw_reg get_tmp( struct brw_vs_compile *c )
 {
@@ -149,7 +145,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    c->first_overflow_output = 0;
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (c->chipset.is_igdng)
       mrf = 8;
    else
       mrf = 4;
@@ -251,7 +247,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (c->chipset.is_igdng)
       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
@@ -1058,7 +1054,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    if (c->prog_data.writes_psiz ||
        c->key.nr_userclip || 
-       BRW_IS_965(p->brw))
+       c->chipset.is_965)
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1089,7 +1085,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (BRW_IS_965(p->brw)) {
+      if (c->chipset.is_965) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1117,7 +1113,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
 
-   if (BRW_IS_IGDNG(p->brw)) {
+   if (c->chipset.is_igdng) {
        /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
        /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
@@ -1205,6 +1201,9 @@ post_vs_emit( struct brw_vs_compile *c,
 static uint32_t
 get_predicate(const struct ureg_instruction *inst)
 {
+   /* XXX: disabling for now
+    */
+#if 0
    if (inst->dst.CondMask == COND_TR)
       return BRW_PREDICATE_NONE;
 
@@ -1237,11 +1236,15 @@ get_predicate(const struct ureg_instruction *inst)
 		    inst->dst.CondMask);
       return BRW_PREDICATE_NORMAL;
    }
+#else
+   return BRW_PREDICATE_NORMAL;
+#endif
 }
 
 static void emit_insn(struct brw_vs_compile *c,
-		      const struct tgsi_full_instruction *insn)
+		      const struct ureg_instruction *inst)
 {
+   struct brw_compile *p = &c->func;
    struct brw_reg args[3], dst;
    GLuint i;
 
@@ -1253,9 +1256,6 @@ static void emit_insn(struct brw_vs_compile *c,
    /* Get argument regs.
     */
    for (i = 0; i < 3; i++) {
-      const struct ureg_src src = inst->src[i];
-      index = src.Index;
-      file = src.File;	
       args[i] = get_arg(c, inst, i);
    }
 
@@ -1263,16 +1263,13 @@ static void emit_insn(struct brw_vs_compile *c,
     * dst and arg, given the static allocation of registers.  So
     * care needs to be taken emitting multi-operation instructions.
     */ 
-   index = inst->dst.Index;
-   file = inst->dst.File;
    dst = get_dst(c, inst->dst);
 
-   if (inst->SaturateMode != SATURATE_OFF) {
-      debug_printf("Unsupported saturate %d in vertex shader",
-		   inst->SaturateMode);
+   if (inst->dst.Saturate) {
+      debug_printf("Unsupported saturate in vertex shader");
    }
 
-   switch (inst->Opcode) {
+   switch (inst->opcode) {
    case TGSI_OPCODE_ABS:
       brw_MOV(p, dst, brw_abs(args[0]));
       break;
@@ -1291,7 +1288,7 @@ static void emit_insn(struct brw_vs_compile *c,
    case TGSI_OPCODE_DPH:
       brw_DPH(p, dst, args[0], args[1]);
       break;
-   case TGSI_OPCODE_NRM3:
+   case TGSI_OPCODE_NRM:
       emit_nrm(c, dst, args[0], 3);
       break;
    case TGSI_OPCODE_NRM4:
@@ -1384,21 +1381,21 @@ static void emit_insn(struct brw_vs_compile *c,
       emit_xpd(p, dst, args[0], args[1]);
       break;
    case TGSI_OPCODE_IF:
-      assert(if_depth < MAX_IF_DEPTH);
-      if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+      assert(c->if_depth < MAX_IF_DEPTH);
+      c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
       /* Note that brw_IF smashes the predicate_control field. */
-      if_inst[if_depth]->header.predicate_control = get_predicate(inst);
-      if_depth++;
+      c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
+      c->if_depth++;
       break;
    case TGSI_OPCODE_ELSE:
-      if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
+      c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
       break;
    case TGSI_OPCODE_ENDIF:
-      assert(if_depth > 0);
-      brw_ENDIF(p, if_inst[--if_depth]);
+      assert(c->if_depth > 0);
+      brw_ENDIF(p, c->if_inst[--c->if_depth]);
       break;			
    case TGSI_OPCODE_BGNLOOP:
-      loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+      c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
       break;
    case TGSI_OPCODE_BRK:
       brw_set_predicate_control(p, get_predicate(inst));
@@ -1415,14 +1412,14 @@ static void emit_insn(struct brw_vs_compile *c,
       struct brw_instruction *inst0, *inst1;
       GLuint br = 1;
 
-      loop_depth--;
+      c->loop_depth--;
 
-      if (BRW_IS_IGDNG(brw))
+      if (c->chipset.is_igdng)
 	 br = 2;
 
-      inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+      inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
       /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-      while (inst0 > loop_inst[loop_depth]) {
+      while (inst0 > c->loop_inst[c->loop_depth]) {
 	 inst0--;
 	 if (inst0->header.opcode == TGSI_OPCODE_BRK) {
 	    inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
@@ -1442,41 +1439,37 @@ static void emit_insn(struct brw_vs_compile *c,
       break;
    case TGSI_OPCODE_CAL:
       brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+      brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
       brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_ADD(p, get_addr_reg(stack_index),
-	      get_addr_reg(stack_index), brw_imm_d(4));
-      brw_save_call(p, inst->Comment, p->nr_insn);
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(4));
+      brw_save_call(p, inst->label, p->nr_insn);
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
    case TGSI_OPCODE_RET:
-      brw_ADD(p, get_addr_reg(stack_index),
-	      get_addr_reg(stack_index), brw_imm_d(-4));
+      brw_ADD(p, get_addr_reg(c->stack_index),
+	      get_addr_reg(c->stack_index), brw_imm_d(-4));
       brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
+      brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
       brw_set_access_mode(p, BRW_ALIGN_16);
       break;
    case TGSI_OPCODE_END:	
-      end_offset = p->nr_insn;
+      c->end_offset = p->nr_insn;
       /* this instruction will get patched later to jump past subroutine
        * code, etc.
        */
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
-   case TGSI_OPCODE_PRINT:
-      /* no-op */
-      break;
    case TGSI_OPCODE_BGNSUB:
-      brw_save_label(p, inst->Comment, p->nr_insn);
+      brw_save_label(p, p->nr_insn, p->nr_insn);
       break;
    case TGSI_OPCODE_ENDSUB:
       /* no-op */
       break;
    default:
       debug_printf("Unsupported opcode %i (%s) in vertex shader",
-		   inst->Opcode, inst->Opcode < MAX_OPCODE ?
-		   _mesa_opcode_string(inst->Opcode) :
-		   "unknown");
+		   inst->opcode, 
+		   tgsi_get_opcode_name(inst->opcode));
    }
 
    /* Set the predication update on the last instruction of the native
@@ -1485,12 +1478,16 @@ static void emit_insn(struct brw_vs_compile *c,
     * This would be problematic if it was set on a math instruction,
     * but that shouldn't be the case with the current GLSL compiler.
     */
+#if 0
+   /* XXX: disabled
+    */
    if (inst->CondUpdate) {
       struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
 
       assert(hw_insn->header.destreg__conditionalmod == 0);
       hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
    }
+#endif
 
    release_tmps(c);
 }
@@ -1498,24 +1495,19 @@ static void emit_insn(struct brw_vs_compile *c,
 
 /* Emit the vertex program instructions here.
  */
-void brw_vs_emit(struct brw_vs_compile *c )
+void brw_vs_emit(struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
-   struct brw_context *brw = p->brw;
-   GLuint insn, if_depth = 0, loop_depth = 0;
-   GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   const struct brw_indirect stack_index = brw_indirect(0, 0);   
-   struct tgsi_parse_context parse;
-   struct tgsi_full_declaration *decl;
-   GLuint index;
-   GLuint file;
+   struct ureg_parse_context parse;
+   struct ureg_declaration *decl;
+   struct ureg_declaration *imm;
+   struct ureg_declaration *insn;
 
-   if (BRW_DEBUG & DEBUG_VS) {
-      debug_printf("vs-mesa:\n");
-      _mesa_print_program(&c->vp->program.Base); 
-      debug_printf("\n");
-   }
+   if (BRW_DEBUG & DEBUG_VS)
+      tgsi_dump(c->vp->tokens, 0); 
+
+   c->stack_index = brw_indirect(0, 0);
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
@@ -1523,12 +1515,15 @@ void brw_vs_emit(struct brw_vs_compile *c )
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
-   brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
 
-   for (insn = 0; insn < nr_insns; insn++) {
+   while (ureg_next_decl(&parse, &decl)) {
+   }
 
-      const struct ureg_instruction *inst = &c->vp->program.Base.Instructions[insn];
-      
+   while (ureg_next_immediate(&parse, &imm)) {
+   }
+
+   while (ureg_next_instruction(&parse, &insn)) {
    }
 
    end_inst = &p->store[end_offset];
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
index 5bc2a49c1f4..084430cf282 100644
--- a/src/gallium/drivers/i965/brw_wm.h
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -34,6 +34,7 @@
 #define BRW_WM_H
 
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_ureg_parse.h"
 
 #include "brw_context.h"
 #include "brw_eu.h"
@@ -163,14 +164,6 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_SUBROUTINE 16
 
 
-struct ureg_instruction {
-   unsigned opcode:8;
-   unsigned tex_target:3;
-   struct ureg_dst dst;
-   struct ureg_src src[3];
-};
-
-
 /* New opcodes to track internal operations required for WM unit.
  * These are added early so that the registers used can be tracked,
  * freed and reused like those of other instructions.
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
index 23f7ba16fd0..59bc4ef701e 100644
--- a/src/gallium/drivers/i965/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -1867,7 +1867,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                 brw_set_access_mode(p, BRW_ALIGN_16);
                 brw_ADD(p, get_addr_reg(stack_index),
                          get_addr_reg(stack_index), brw_imm_d(4));
-		brw_save_call(&c->func, inst->Comment, p->nr_insn);
+		brw_save_call(&c->func, inst->label, p->nr_insn);
                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
                 brw_pop_insn_state(p);
 		break;
-- 
cgit v1.2.3


From 81b8589f064204d9ddcd7d1f9d43d2dcf5676235 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Wed, 28 Oct 2009 21:24:03 +0000
Subject: i965g: still working on compilation

---
 src/gallium/drivers/i965/brw_vs.c               |   2 +-
 src/gallium/drivers/i965/brw_vs.h               |   3 +
 src/gallium/drivers/i965/brw_vs_emit.c          | 199 +++++++++++++++++-------
 src/gallium/drivers/i965/brw_vs_state.c         |  25 +--
 src/gallium/drivers/i965/brw_vs_surface_state.c |  33 +++-
 src/gallium/drivers/i965/brw_wm.c               |  36 +++--
 6 files changed, 208 insertions(+), 90 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 3965ca6c54e..26a28114d98 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -57,7 +57,7 @@ static void do_vs_prog( struct brw_context *brw,
    c.prog_data.nr_inputs = vp->info.num_inputs;
    c.prog_data.copy_edgeflag = c.key.copy_edgeflag;
 
-   if (0)
+   if (1)
       tgsi_dump(c.vp->tokens, 0);
 
    /* Emit GEN4 code.
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 2a2dbb3457c..b4e450d89bf 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -60,6 +60,9 @@ struct brw_vs_compile {
 
    GLuint nr_inputs;
    GLuint nr_outputs;
+   GLuint nr_immediates;
+   GLfloat immediate[128][4];
+
    GLboolean copy_edgeflag;
 
    GLuint first_output;
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 5366ab85140..6809bccdecc 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -34,8 +34,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_ureg_parse.h"
+#include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_info.h"
 
@@ -67,6 +66,7 @@ static void release_tmps( struct brw_vs_compile *c )
 }
 
 
+
 /**
  * Preallocate GRF register before code emit.
  * Do things as simply as possible.  Allocate and populate all regs
@@ -83,10 +83,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * XXX this heuristic/check may need some fine tuning...
     */
    if (c->vp->info.file_max[TGSI_FILE_CONSTANT] +
+       c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
        c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF)
       c->vp->use_const_buffer = GL_TRUE;
-   else
+   else {
+      /* XXX: immediates can go elsewhere if necessary:
+       */
+      assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] +
+	     c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 21 > BRW_MAX_GRF);
+
       c->vp->use_const_buffer = GL_FALSE;
+   }
 
    /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
@@ -139,6 +146,29 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    if (c->nr_inputs == 0)
       reg++;
 
+   /* Allocate a GRF and load immediate values by hand with 4 MOVs!!!
+    *
+    * XXX: Try to encode float immediates as brw immediates
+    * XXX: Put immediates into the CURBE.
+    * XXX: Make sure ureg sets minimal immediate size and respect it
+    * here.
+    */
+   for (i = 0; i < c->nr_immediates; i++) {
+      struct brw_reg r;
+      int j;
+
+      r = brw_vec8_grf(reg, 0);
+
+      for (j = 0; j < 4; j++) {
+	 brw_MOV(&c->func, 
+		 brw_writemask(r, (1<<j)), 
+		 brw_imm_f(c->immediate[i][j]));
+      }
+
+      reg++;
+   }
+
+
    /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = 0;
@@ -754,21 +784,20 @@ static void emit_nrm( struct brw_vs_compile *c,
 
 static struct brw_reg
 get_constant(struct brw_vs_compile *c,
-             const struct ureg_instruction *inst,
-             GLuint argIndex)
+	     GLuint argIndex,
+	     GLuint index,
+	     GLboolean relAddr)
 {
-   const struct ureg_src src = inst->src[argIndex];
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
-   const GLboolean relAddr = src.Indirect;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src.Index || relAddr) {
+   if (c->current_const[argIndex].index != index || relAddr) {
       struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
 
-      c->current_const[argIndex].index = src.Index;
+      c->current_const[argIndex].index = index;
 
 #if 0
       printf("  fetch const[%d] for arg %d into reg %d\n",
@@ -780,7 +809,7 @@ get_constant(struct brw_vs_compile *c,
                        0,                             /* oword */
                        relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
-                       16 * src.Index,               /* byte offset */
+                       16 * index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
@@ -797,7 +826,7 @@ get_constant(struct brw_vs_compile *c,
                           1,                       /* oword */
                           relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
-                          16 * src.Index,         /* byte offset */
+                          16 * index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
                           );
       }
@@ -894,12 +923,11 @@ static struct brw_reg deref( struct brw_vs_compile *c,
  */
 static struct brw_reg
 get_src_reg( struct brw_vs_compile *c,
-             const struct ureg_instruction *inst,
-             GLuint argIndex )
+	     GLuint argIndex,
+	     GLuint file,
+	     GLint index,
+	     GLboolean relAddr )
 {
-   const GLuint file = inst->src[argIndex].File;
-   const GLint index = inst->src[argIndex].Index;
-   const GLboolean relAddr = inst->src[argIndex].Indirect;
 
    switch (file) {
    case TGSI_FILE_TEMPORARY:
@@ -913,9 +941,12 @@ get_src_reg( struct brw_vs_compile *c,
          return c->regs[file][index];
       }
 
+   case TGSI_FILE_IMMEDIATE:
+      return c->regs[file][index];
+
    case TGSI_FILE_CONSTANT:
       if (c->vp->use_const_buffer) {
-         return get_constant(c, inst, argIndex);
+         return get_constant(c, argIndex, index, relAddr);
       }
       else if (relAddr) {
          return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
@@ -962,27 +993,32 @@ static void emit_arl( struct brw_vs_compile *c,
  * Return the brw reg for the given instruction's src argument.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-                               const struct ureg_instruction *inst,
+                               const struct tgsi_full_src_register *src,
                                GLuint argIndex )
 {
-   const struct ureg_src src = inst->src[argIndex];
    struct brw_reg reg;
 
-   if (src.File == TGSI_FILE_NULL)
+   if (src->SrcRegister.File == TGSI_FILE_NULL)
       return brw_null_reg();
 
-   reg = get_src_reg(c, inst, argIndex);
+   reg = get_src_reg(c, argIndex,
+		     src->SrcRegister.File,
+		     src->SrcRegister.Index,
+		     src->SrcRegister.Indirect);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src.SwizzleX,
-				       src.SwizzleY,
-				       src.SwizzleZ,
-				       src.SwizzleW);
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SrcRegister.SwizzleX,
+				       src->SrcRegister.SwizzleY,
+				       src->SrcRegister.SwizzleZ,
+				       src->SrcRegister.SwizzleW);
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src.Negate ? 1 : 0;   
+   reg.negate = src->SrcRegister.Negate ? 1 : 0;   
+
+   /* XXX: abs, absneg
+    */
 
    return reg;
 }
@@ -992,19 +1028,21 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
  * Get brw register for the given program dest register.
  */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
-			       struct ureg_dst dst )
+			       unsigned file,
+			       unsigned index,
+			       unsigned writemask )
 {
    struct brw_reg reg;
 
-   switch (dst.File) {
+   switch (file) {
    case TGSI_FILE_TEMPORARY:
    case TGSI_FILE_OUTPUT:
-      assert(c->regs[dst.File][dst.Index].nr != 0);
-      reg = c->regs[dst.File][dst.Index];
+      assert(c->regs[file][index].nr != 0);
+      reg = c->regs[file][index];
       break;
    case TGSI_FILE_ADDRESS:
-      assert(dst.Index == 0);
-      reg = c->regs[dst.File][dst.Index];
+      assert(index == 0);
+      reg = c->regs[file][index];
       break;
    case TGSI_FILE_NULL:
       /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
@@ -1015,7 +1053,7 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
       reg = brw_null_reg();
    }
 
-   reg.dw1.bits.writemask = dst.WriteMask;
+   reg.dw1.bits.writemask = writemask;
 
    return reg;
 }
@@ -1199,7 +1237,7 @@ post_vs_emit( struct brw_vs_compile *c,
 }
 
 static uint32_t
-get_predicate(const struct ureg_instruction *inst)
+get_predicate(const struct tgsi_full_instruction *inst)
 {
    /* XXX: disabling for now
     */
@@ -1242,8 +1280,10 @@ get_predicate(const struct ureg_instruction *inst)
 }
 
 static void emit_insn(struct brw_vs_compile *c,
-		      const struct ureg_instruction *inst)
+		      const struct tgsi_full_instruction *inst)
 {
+   unsigned opcode = inst->Instruction.Opcode;
+   unsigned label = inst->InstructionExtLabel.Label;
    struct brw_compile *p = &c->func;
    struct brw_reg args[3], dst;
    GLuint i;
@@ -1256,20 +1296,25 @@ static void emit_insn(struct brw_vs_compile *c,
    /* Get argument regs.
     */
    for (i = 0; i < 3; i++) {
-      args[i] = get_arg(c, inst, i);
+      args[i] = get_arg(c, &inst->FullSrcRegisters[i], i);
    }
 
    /* Get dest regs.  Note that it is possible for a reg to be both
     * dst and arg, given the static allocation of registers.  So
     * care needs to be taken emitting multi-operation instructions.
     */ 
-   dst = get_dst(c, inst->dst);
+   dst = get_dst(c, 
+		 inst->FullDstRegisters[0].DstRegister.File,
+		 inst->FullDstRegisters[0].DstRegister.Index,
+		 inst->FullDstRegisters[0].DstRegister.WriteMask);
 
-   if (inst->dst.Saturate) {
+   /* XXX: saturate
+    */
+   if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
       debug_printf("Unsupported saturate in vertex shader");
    }
 
-   switch (inst->opcode) {
+   switch (opcode) {
    case TGSI_OPCODE_ABS:
       brw_MOV(p, dst, brw_abs(args[0]));
       break;
@@ -1443,7 +1488,7 @@ static void emit_insn(struct brw_vs_compile *c,
       brw_set_access_mode(p, BRW_ALIGN_16);
       brw_ADD(p, get_addr_reg(c->stack_index),
 	      get_addr_reg(c->stack_index), brw_imm_d(4));
-      brw_save_call(p, inst->label, p->nr_insn);
+      brw_save_call(p, label, p->nr_insn);
       brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
       break;
    case TGSI_OPCODE_RET:
@@ -1468,8 +1513,8 @@ static void emit_insn(struct brw_vs_compile *c,
       break;
    default:
       debug_printf("Unsupported opcode %i (%s) in vertex shader",
-		   inst->opcode, 
-		   tgsi_get_opcode_name(inst->opcode));
+		   opcode, 
+		   tgsi_get_opcode_name(opcode));
    }
 
    /* Set the predication update on the last instruction of the native
@@ -1498,11 +1543,12 @@ static void emit_insn(struct brw_vs_compile *c,
 void brw_vs_emit(struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
+   const struct tgsi_token *tokens = c->vp->tokens;
    struct brw_instruction *end_inst, *last_inst;
-   struct ureg_parse_context parse;
-   struct ureg_declaration *decl;
-   struct ureg_declaration *imm;
-   struct ureg_declaration *insn;
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction *inst;
+   boolean done = FALSE;
+   int i;
 
    if (BRW_DEBUG & DEBUG_VS)
       tgsi_dump(c->vp->tokens, 0); 
@@ -1512,21 +1558,66 @@ void brw_vs_emit(struct brw_vs_compile *c)
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
+   /* Inputs */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+	 /* Nothing to do -- using info from tgsi_scan().
+	  */
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE: {
+	 static const float id[4] = {0,0,0,1};
+	 const float *imm = &parse.FullToken.FullImmediate.u[i].Float;
+	 unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+
+	 for (i = 0; i < size; i++)
+	    c->immediate[c->nr_immediates][i] = imm[i];
+
+	 for ( ; i < 4; i++)
+	    c->immediate[c->nr_immediates][i] = id[i];
+
+	 c->nr_immediates++;
+	 break;
+      }
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+	 done = 1;
+	 break;
+      }
+   }
+
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
    brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
 
-   while (ureg_next_decl(&parse, &decl)) {
-   }
-
-   while (ureg_next_immediate(&parse, &imm)) {
-   }
-
-   while (ureg_next_instruction(&parse, &insn)) {
+   /* Instructions
+    */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+	 break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         inst = &parse.FullToken.FullInstruction;
+	 emit_insn( c, inst );
+         break;
+
+      default:
+         assert( 0 );
+      }
    }
+   tgsi_parse_free( &parse );
 
-   end_inst = &p->store[end_offset];
+   end_inst = &p->store[c->end_offset];
    last_inst = &p->store[p->nr_insn];
 
    /* The END instruction will be patched to jump to this code */
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
index 05a91f2de44..549696f7ae7 100644
--- a/src/gallium/drivers/i965/brw_vs_state.c
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -29,8 +29,10 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
             
+#include "util/u_math.h"
 
 
+#include "brw_debug.h"
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
@@ -64,8 +66,8 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    /* BRW_NEW_NR_VS_SURFACES */
    key->nr_surfaces = brw->vs.nr_surfaces;
 
-   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (ctx->Transform.ClipPlanesEnabled) {
+   /* PIPE_NEW_CLIP */
+   if (brw->curr.ucp.nr) {
       /* Note that we read in the userclip planes as well, hence
        * clip_start:
        */
@@ -86,7 +88,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    memset(&vs, 0, sizeof(vs));
 
    vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
-   vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
+   vs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
    vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
    /* Choosing multiple program flow means that we may get 2-vertex threads,
     * which will have the channel mask for dwords 4-7 enabled in the thread,
@@ -119,6 +121,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
       chipset_max_threads = 32;
    else
       chipset_max_threads = 16;
+
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
 				  1, chipset_max_threads) - 1;
 
@@ -145,16 +148,16 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 			 NULL, NULL);
 
    /* Emit VS program relocation */
-   dri_bo_emit_reloc(bo,
-		     I915_GEM_DOMAIN_INSTRUCTION, 0,
-		     vs.thread0.grf_reg_count << 1,
-		     offsetof(struct brw_vs_unit_state, thread0),
-		     brw->vs.prog_bo);
+   brw->sws->bo_emit_reloc(bo,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   vs.thread0.grf_reg_count << 1,
+			   offsetof(struct brw_vs_unit_state, thread0),
+			   brw->vs.prog_bo);
 
    return bo;
 }
 
-static void prepare_vs_unit(struct brw_context *brw)
+static int prepare_vs_unit(struct brw_context *brw)
 {
    struct brw_vs_unit_key key;
 
@@ -168,11 +171,13 @@ static void prepare_vs_unit(struct brw_context *brw)
    if (brw->vs.state_bo == NULL) {
       brw->vs.state_bo = vs_unit_create_from_key(brw, &key);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM,
+      .mesa  = (PIPE_NEW_CLIP),
       .brw   = (BRW_NEW_CURBE_OFFSETS |
                 BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
index 319e29bfcbc..9a9d47a8a3e 100644
--- a/src/gallium/drivers/i965/brw_vs_surface_state.c
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -32,6 +32,11 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_winsys.h"
+
+/* XXX: disabled true constant buffer functionality
+ */
+
 
 /* Creates a new VS constant buffer reflecting the current VS program's
  * constants, if needed by the VS program.
@@ -39,9 +44,12 @@
  * Otherwise, constants go through the CURBEs using the brw_constant_buffer
  * state atom.
  */
-static drm_intel_bo *
+#if 0
+static struct brw_winsys_buffer *
 brw_vs_update_constant_buffer(struct brw_context *brw)
 {
+   /* XXX: true constant buffers
+    */
    struct brw_vertex_program *vp =
       (struct brw_vertex_program *) brw->vertex_program;
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
@@ -61,21 +69,20 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
 
    return const_buffer;
 }
+#endif
 
 /**
  * Update the surface state for a VS constant buffer.
  *
  * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
  */
+#if 0
 static void
 brw_update_vs_constant_surface( struct brw_context *brw,
                                 GLuint surf)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_surface_key key;
-   struct brw_vertex_program *vp =
-      (struct brw_vertex_program *) brw->vertex_program;
-   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   struct pipe_buffer *cb = brw->curr.vs_constants;
 
    assert(surf == 0);
 
@@ -121,6 +128,7 @@ brw_update_vs_constant_surface( struct brw_context *brw,
       brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
    }
 }
+#endif
 
 
 /**
@@ -129,6 +137,7 @@ brw_update_vs_constant_surface( struct brw_context *brw,
 static struct brw_winsys_buffer *
 brw_vs_get_binding_table(struct brw_context *brw)
 {
+#if 0
    struct brw_winsys_buffer *bind_bo;
 
    bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
@@ -169,6 +178,9 @@ brw_vs_get_binding_table(struct brw_context *brw)
    }
 
    return bind_bo;
+#else
+   return NULL;
+#endif
 }
 
 /**
@@ -178,8 +190,9 @@ brw_vs_get_binding_table(struct brw_context *brw)
  * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
  * CACHE_NEW_SURF_BIND for the binding table upload.
  */
-static void prepare_vs_surfaces(struct brw_context *brw )
+static int prepare_vs_surfaces(struct brw_context *brw )
 {
+#if 0
    int i;
    int nr_surfaces = 0;
 
@@ -195,6 +208,7 @@ static void prepare_vs_surfaces(struct brw_context *brw )
       brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
       brw->vs.nr_surfaces = nr_surfaces;
    }
+#endif
 
    /* Note that we don't end up updating the bind_bo if we don't have a
     * surface to be pointing at.  This should be relatively harmless, as it
@@ -204,12 +218,15 @@ static void prepare_vs_surfaces(struct brw_context *brw )
       brw->sws->bo_unreference(brw->vs.bind_bo);
       brw->vs.bind_bo = brw_vs_get_binding_table(brw);
    }
+
+   return 0;
 }
 
 const struct brw_tracked_state brw_vs_surfaces = {
    .dirty = {
-      .mesa = (_NEW_PROGRAM_CONSTANTS),
-      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .mesa = (PIPE_NEW_VERTEX_CONSTANTS |
+	       PIPE_NEW_VERTEX_SHADER),
+      .brw = 0,
       .cache = 0
    },
    .prepare = prepare_vs_surfaces,
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
index 3d889699f89..f0dabfcfd0e 100644
--- a/src/gallium/drivers/i965/brw_wm.c
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -28,11 +28,14 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-             
+
+#include "tgsi/tgsi_info.h"
+
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_wm.h"
 #include "brw_state.h"
+#include "brw_debug.h"
 
 
 /** Return number of src args for given instruction */
@@ -54,7 +57,7 @@ GLuint brw_wm_nr_args( GLuint opcode )
       return 3;
    default:
       assert(opcode < MAX_OPCODE);
-      return _mesa_num_inst_src_regs(opcode);
+      return tgsi_get_opcode_info(opcode)->num_src;
    }
 }
 
@@ -62,17 +65,17 @@ GLuint brw_wm_nr_args( GLuint opcode )
 GLuint brw_wm_is_scalar_result( GLuint opcode )
 {
    switch (opcode) {
-   case OPCODE_COS:
-   case OPCODE_EX2:
-   case OPCODE_LG2:
-   case OPCODE_POW:
-   case OPCODE_RCP:
-   case OPCODE_RSQ:
-   case OPCODE_SIN:
-   case OPCODE_DP3:
-   case OPCODE_DP4:
-   case OPCODE_DPH:
-   case OPCODE_DST:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SIN:
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_DST:
       return 1;
       
    default:
@@ -134,7 +137,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
  * we'll use one of two code generators.
  */
 static void do_wm_prog( struct brw_context *brw,
-			struct brw_fragment_program *fp, 
+			struct brw_fragment_shader *fp, 
 			struct brw_wm_prog_key *key)
 {
    struct brw_wm_compile *c;
@@ -163,7 +166,7 @@ static void do_wm_prog( struct brw_context *brw,
    brw_init_compile(brw, &c->func);
 
    /* temporary sanity check assertion */
-   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   assert(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
 
    /*
     * Shader which use GLSL features such as flow control are handled
@@ -200,8 +203,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   const struct brw_fragment_program *fp = 
-      (struct brw_fragment_program *)brw->fragment_program;
+   const struct brw_fragment_program *fp = brw->curr.fragment_shader;
    GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
-- 
cgit v1.2.3


From 2f54d02d205468a840b35a3554f2ad8ffc31ec9c Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 10 Nov 2009 18:07:11 -0800
Subject: i965g: consult fs inputs when laying out vs output regs

Vertex shader now emits just the FS inputs, in the positions and order
expected by the fragment shader.

This means potentially regenerating the vertex shader to match
different fragment shader's input layouts.
---
 src/gallium/drivers/i965/brw_context.h     |  13 ++++
 src/gallium/drivers/i965/brw_pipe_shader.c |   6 ++
 src/gallium/drivers/i965/brw_vs.c          |  14 ++--
 src/gallium/drivers/i965/brw_vs.h          |   7 +-
 src/gallium/drivers/i965/brw_vs_emit.c     | 116 ++++++++++++++++++++---------
 5 files changed, 113 insertions(+), 43 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 4a975ecd7ec..31f3cf36855 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -161,11 +161,24 @@ struct brw_vertex_shader {
    GLboolean use_const_buffer;
 };
 
+struct brw_fs_signature {
+   GLuint nr_inputs;
+   struct {
+      GLuint semantic:5;
+      GLuint semantic_index:27;
+   } input[PIPE_MAX_SHADER_INPUTS];
+};
+
+#define brw_fs_signature_size(s) (offsetof(struct brw_fs_signature, input) + \
+                                  ((s)->nr_inputs * sizeof (s)->input[0])) 
+
 
 struct brw_fragment_shader {
    const struct tgsi_token *tokens;
    struct tgsi_shader_info info;
 
+   struct brw_fs_signature signature;
+
    unsigned iz_lookup;
    //unsigned wm_lookup;
    
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 44f9ad6f9cd..7febf9e0c2f 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -96,6 +96,12 @@ static void *brw_create_fs_state( struct pipe_context *pipe,
 
    tgsi_scan_shader(fs->tokens, &fs->info);
 
+   fs->signature.nr_inputs = fs->info.num_inputs;
+   for (i = 0; i < fs->info.num_inputs; i++) {
+      fs->signature.input[i].semantic = fs->info.input_semantic_name[i];
+      fs->signature.input[i].semantic_index = fs->info.input_semantic_index[i];
+   }
+
    for (i = 0; i < fs->info.num_inputs; i++)
       if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION)
 	 fs->uses_depth = 1;
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 966940ceacb..05a62ed9745 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -90,22 +90,24 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 {
    struct brw_vs_prog_key key;
    struct brw_vertex_shader *vp = brw->curr.vertex_shader;
+   struct brw_fragment_shader *fs = brw->curr.fragment_shader;
    enum pipe_error ret;
 
    memset(&key, 0, sizeof(key));
 
-   /* Just upload the program verbatim for now.  Always send it all
-    * the inputs it asks for, whether they are varying or not.
-    */
    key.program_string_id = vp->id;
    key.nr_userclip = brw->curr.ucp.nr;
    key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
 			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
+   memcpy(&key.fs_signature, &fs->signature,
+          brw_fs_signature_size(&fs->signature));
+
+
    /* Make an early check for the key.
     */
    if (brw_search_cache(&brw->cache, BRW_VS_PROG,
-                        &key, sizeof(key),
+                        &key, brw_vs_prog_key_size(&key),
                         NULL, 0,
                         &brw->vs.prog_data,
                         &brw->vs.prog_bo))
@@ -123,7 +125,9 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = PIPE_NEW_CLIP | PIPE_NEW_RAST,
+      .mesa  = (PIPE_NEW_CLIP | 
+                PIPE_NEW_RAST |
+                PIPE_NEW_FRAGMENT_SHADER),
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index b4e450d89bf..3d1598d02b9 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -43,8 +43,11 @@ struct brw_vs_prog_key {
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
    GLuint pad:26;
+   struct brw_fs_signature fs_signature;
 };
 
+#define brw_vs_prog_key_size(s) (offsetof(struct brw_vs_prog_key, fs_signature) + \
+                                 brw_fs_signature_size(&(s)->fs_signature))
 
 
 #define MAX_IF_DEPTH 32
@@ -65,8 +68,8 @@ struct brw_vs_compile {
 
    GLboolean copy_edgeflag;
 
-   GLuint first_output;
-   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
+   GLuint overflow_grf_start;
+   GLuint overflow_count;
 
    GLuint first_tmp;
    GLuint last_tmp;
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 26f0ec5a11a..933c9c4d63c 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -66,6 +66,38 @@ static void release_tmps( struct brw_vs_compile *c )
 }
 
 
+static boolean is_position_output( struct brw_vs_compile *c,
+                                   unsigned vs_output )
+{
+   struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
+
+   return (semantic == TGSI_SEMANTIC_POSITION &&
+           index == 0);
+}
+
+
+static boolean find_output_slot( struct brw_vs_compile *c,
+                                  unsigned vs_output,
+                                  unsigned *fs_input_slot )
+{
+   struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
+   unsigned i;
+
+   for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
+      if (c->key.fs_signature.input[i].semantic == semantic &&
+          c->key.fs_signature.input[i].semantic_index == index) {
+         *fs_input_slot = i;
+         return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
 
 /**
  * Preallocate GRF register before code emit.
@@ -172,42 +204,50 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = c->prog_data.nr_outputs;
-   c->first_output = reg;
-   c->first_overflow_output = 0;
 
    if (c->chipset.is_igdng)
       mrf = 8;
    else
       mrf = 4;
 
+   
+   if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
+      c->overflow_grf_start = reg;
+      c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
+      reg += c->overflow_count;
+   }
+
    /* XXX: need to access vertex output semantics here:
     */
    for (i = 0; i < c->prog_data.nr_outputs; i++) {
-      assert(i < Elements(c->regs[TGSI_FILE_OUTPUT]));
+      unsigned slot;
 
-      /* XXX: Hardwire position to zero:
-       */
-      if (i == 0) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
-      }
-      /* XXX: disable psiz:
+      /* XXX: Put output position in slot zero always.  Clipper, etc,
+       * need access to this reg.
        */
-      else if (0) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
+      if (is_position_output(c, i)) {
+	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
 	 reg++;
-	 mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
       }
-      else if (mrf < 16) {
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
-	 mrf++;
+      else if (find_output_slot(c, i, &slot)) {
+         
+         if (0 /* is_psize_output(c, i) */ ) {
+            /* c->psize_out.grf = reg; */
+            /* c->psize_out.mrf = i; */
+         }
+         
+         /* The first (16-4) outputs can go straight into the message regs.
+          */
+         if (slot + mrf < BRW_MAX_MRF) {
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
+         }
+         else {
+            int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
+            c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
+         }
       }
       else {
-	 /* too many vertex results to fit in MRF, use GRF for overflow */
-	 if (!c->first_overflow_output)
-	    c->first_overflow_output = i;
-	 c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
+         c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
       }
    }     
 
@@ -1072,6 +1112,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
    int eot;
+   int i;
    GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
@@ -1167,7 +1208,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        len_vertext_header = 2;
    }
 
-   eot = (c->first_overflow_output == 0);
+   eot = (c->overflow_count == 0);
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -1182,19 +1223,22 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
 
-   if (c->first_overflow_output > 0) {
-      /* Not all of the vertex outputs/results fit into the MRF.
-       * Move the overflowed attributes from the GRF to the MRF and
-       * issue another brw_urb_WRITE().
-       */
+   /* Not all of the vertex outputs/results fit into the MRF.
+    * Move the overflowed attributes from the GRF to the MRF and
+    * issue another brw_urb_WRITE().
+    */
+   for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
+      unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
+      GLuint j;
+
+      eot = (i + nr >= c->overflow_count);
+
       /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
        * at mrf[4] atm...
        */
-      GLuint i, mrf = 0;
-      for (i = c->first_overflow_output; i < c->prog_data.nr_outputs; i++) {
-	 /* move from GRF to MRF */
-	 brw_MOV(p, brw_message_reg(4+mrf), c->regs[TGSI_FILE_OUTPUT][i]);
-	 mrf++;
+      for (j = 0; j < nr; j++) {
+	 brw_MOV(p, brw_message_reg(4+j), 
+                 brw_vec8_grf(c->overflow_grf_start + i + j, 0));
       }
 
       brw_urb_WRITE(p,
@@ -1203,11 +1247,11 @@ static void emit_vertex_write( struct brw_vs_compile *c)
                     c->r0,          /* src */
                     0,              /* allocate */
                     1,              /* used */
-                    mrf+1,          /* msg len */
+                    nr+1,          /* msg len */
                     0,              /* response len */
-                    1,              /* eot */
-                    1,              /* writes complete */
-                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    eot,            /* eot */
+                    eot,            /* writes complete */
+                    i-1,            /* urb destination offset */
                     BRW_URB_SWIZZLE_INTERLEAVE);
    }
 }
-- 
cgit v1.2.3


From d186079520234a776c3fa88c81da935d65981fec Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Tue, 22 Dec 2009 21:26:51 +0100
Subject: i965g: fix for edgeflag changes (untested)

---
 src/gallium/drivers/i965/brw_pipe_shader.c |  3 ---
 src/gallium/drivers/i965/brw_pipe_vertex.c |  7 -------
 src/gallium/drivers/i965/brw_vs.c          | 14 --------------
 src/gallium/drivers/i965/brw_vs.h          |  3 ---
 src/gallium/drivers/i965/brw_vs_emit.c     |  6 ------
 5 files changed, 33 deletions(-)

(limited to 'src/gallium/drivers/i965/brw_vs.h')

diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 31a715ab655..20f20571f65 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -215,15 +215,12 @@ static void *brw_create_vs_state( struct pipe_context *pipe,
          else
             vs->output_bfc1 = i;
          break;
-#if 0
       case TGSI_SEMANTIC_EDGEFLAG:
          vs->output_edgeflag = i;
          break;
-#endif
       }
    }
 
-
    
    /* Done:
     */
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
index 3d87a2853f7..e3c48e31493 100644
--- a/src/gallium/drivers/i965/brw_pipe_vertex.c
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -44,19 +44,12 @@ static void brw_set_vertex_buffers(struct pipe_context *pipe,
    brw->state.dirty.mesa |= PIPE_NEW_VERTEX_BUFFER;
 }
 
-static void brw_set_edgeflags( struct pipe_context *pipe,
-			       const unsigned *bitfield )
-{
-   /* XXX */
-}
-
 
 void 
 brw_pipe_vertex_init( struct brw_context *brw )
 {
    brw->base.set_vertex_buffers = brw_set_vertex_buffers;
    brw->base.set_vertex_elements = brw_set_vertex_elements;
-   brw->base.set_edgeflags = brw_set_edgeflags;
 }
 
 
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
index 14a1c3bcf18..e3ea5a3a135 100644
--- a/src/gallium/drivers/i965/brw_vs.c
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -58,18 +58,6 @@ static enum pipe_error do_vs_prog( struct brw_context *brw,
    c.prog_data.nr_outputs = vp->info.num_outputs;
    c.prog_data.nr_inputs = vp->info.num_inputs;
 
-   /* XXX: we want edgeflag handling to be integrated to the vertex
-    * shader, but are currently faking the edgeflag output:
-    */
-   if (c.key.copy_edgeflag) {
-      c.prog_data.output_edgeflag = c.prog_data.nr_outputs;
-      c.prog_data.nr_outputs++;
-   }
-   else {
-      c.prog_data.output_edgeflag = ~0;
-   }
-
-
    if (1)
       tgsi_dump(c.vp->tokens, 0);
 
@@ -108,8 +96,6 @@ static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
 
    key.program_string_id = vp->id;
    key.nr_userclip = brw->curr.ucp.nr;
-   key.copy_edgeflag = (brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL ||
-			brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL);
 
    memcpy(&key.fs_signature, sig, brw_fs_signature_size(sig));
 
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
index 3d1598d02b9..944d88c84cc 100644
--- a/src/gallium/drivers/i965/brw_vs.h
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -41,7 +41,6 @@
 struct brw_vs_prog_key {
    GLuint program_string_id;
    GLuint nr_userclip:4;
-   GLuint copy_edgeflag:1;
    GLuint pad:26;
    struct brw_fs_signature fs_signature;
 };
@@ -66,8 +65,6 @@ struct brw_vs_compile {
    GLuint nr_immediates;
    GLfloat immediate[128][4];
 
-   GLboolean copy_edgeflag;
-
    GLuint overflow_grf_start;
    GLuint overflow_count;
 
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 1d0fff0d9ea..714def5046d 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -1141,12 +1141,6 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    int i;
    GLuint len_vertext_header = 2;
 
-   if (c->key.copy_edgeflag) {
-      brw_MOV(p, 
-              get_reg(c, TGSI_FILE_OUTPUT, c->prog_data.output_edgeflag),
-              brw_imm_f(1));
-   }
-
    /* Build ndc coords */
    ndc = get_tmp(c);
    /* ndc = 1.0 / pos.w */
-- 
cgit v1.2.3